diff --git a/.bazelrc b/.bazelrc index a543ebbcd75..8be3dadaf4e 100644 --- a/.bazelrc +++ b/.bazelrc @@ -46,7 +46,6 @@ # sycl_asan: # sycl_trisycl: # mkl: Enable full mkl support. -# mkl_open_source_only: Enable MKL support only using open source MKL libraries. # tensorrt: Enable Tensorrt support. # ngraph: Enable ngraph support. # numa: Enable numa using hwloc. @@ -140,13 +139,6 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt -# This config option is used to enable MKL-DNN open source library only, -# without depending on MKL binary version. -build:mkl_open_source_only --define=build_with_mkl_dnn_only=true -build:mkl_open_source_only --define=build_with_mkl_dnn_v1_only=true -build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true -build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=0 - # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. build:using_cuda --define=using_cuda=true @@ -248,6 +240,7 @@ build:windows --copt=/w # Tensorflow uses M_* math constants that only get defined by MSVC headers if # _USE_MATH_DEFINES is defined. build:windows --copt=/D_USE_MATH_DEFINES +build:windows --host_copt=/D_USE_MATH_DEFINES # Default paths for TF_SYSTEM_LIBS build:linux --define=PREFIX=/usr diff --git a/.gitignore b/.gitignore index eab8a64c63d..72cb418fe11 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,9 @@ gradleBuild *.pbxproj *.xcworkspace /*.podspec -/tensorflow/lite/**/[ios|objc|swift]*/BUILD +/tensorflow/lite/**/ios/BUILD +/tensorflow/lite/**/objc/BUILD +/tensorflow/lite/**/swift/BUILD /tensorflow/lite/examples/ios/simple/data/*.tflite /tensorflow/lite/examples/ios/simple/data/*.txt Podfile.lock diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 0b31ca33d20..c5574793b74 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -154,7 +154,10 @@ tf_cuda_library( "c_api.h", ], copts = tf_copts(), - visibility = ["//tensorflow/c:__subpackages__"], + visibility = [ + "//tensorflow/c:__subpackages__", + "//third_party/llvm/llvm-project:__subpackages__", + ], deps = [ ":c_api_internal", ":tf_attrtype", @@ -698,4 +701,5 @@ tf_cuda_library( # TODO(b/74620627): remove when _USE_C_SHAPES is removed "//tensorflow/python:cpp_shape_inference_proto_cc", ], + alwayslink = 1, ) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 06a6bc64e74..bc1fbd3fcf5 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -774,7 +774,7 @@ extern "C" { static TF_OperationDescription* TF_NewOperationLocked(TF_Graph* graph, const char* op_type, const char* oper_name) - EXCLUSIVE_LOCKS_REQUIRED(graph->mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu) { return new TF_OperationDescription(graph, op_type, oper_name); } @@ -1032,7 +1032,7 @@ void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc, TF_Status* status) - EXCLUSIVE_LOCKS_REQUIRED(desc->graph->mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(desc->graph->mu) { Node* ret = nullptr; if (desc->graph->name_map.count(desc->node_builder.node_name())) { @@ -1706,7 +1706,7 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def, const TF_ImportGraphDefOptions* opts, TF_ImportGraphDefResults* tf_results, TF_Status* status) - EXCLUSIVE_LOCKS_REQUIRED(graph->mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu) { const int last_node_id = 
graph->graph.num_node_ids(); tensorflow::ImportGraphDefResults results; status->status = tensorflow::ImportGraphDef(opts->opts, def, &graph->graph, diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc index a908fd131c1..a235ea0cf5a 100644 --- a/tensorflow/c/c_api_function.cc +++ b/tensorflow/c/c_api_function.cc @@ -51,7 +51,7 @@ Status ProcessInputs( const TF_Graph* fn_body, const char* fn_name, int ninputs, const TF_Output* inputs, std::vector<OutputTensor>* input_tensors, std::unordered_map<const Node*, std::vector<int>>* input_nodes) - EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { input_tensors->reserve(ninputs); for (int i = 0; i < ninputs; ++i) { Node* node = &inputs[i].oper->node; @@ -87,7 +87,7 @@ Status ProcessInputs( Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name, int noutputs, const TF_Output* outputs, std::vector<OutputTensor>* output_tensors) - EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { output_tensors->reserve(noutputs); for (int i = 0; i < noutputs; ++i) { Node* node = &outputs[i].oper->node; @@ -111,7 +111,7 @@ Status ComputeBodyNodes( const TF_Operation* const* opers, const std::unordered_map<const Node*, std::vector<int>>& input_nodes, std::vector<const Node*>* body_nodes) - EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) { if (num_opers == -1) { for (const Node* node : fn_body->graph.op_nodes()) { const auto& iter = input_nodes.find(node); diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 9e1b54f0029..32880378c2b 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -71,14 +71,14 @@ struct TF_Graph { TF_Graph(); tensorflow::mutex mu; - tensorflow::Graph graph GUARDED_BY(mu); + tensorflow::Graph graph TF_GUARDED_BY(mu); // Runs shape inference. - tensorflow::ShapeRefiner refiner GUARDED_BY(mu); + tensorflow::ShapeRefiner refiner TF_GUARDED_BY(mu); // Maps from name of an operation to the Node* in 'graph'. std::unordered_map<tensorflow::string, tensorflow::Node*> name_map - GUARDED_BY(mu); + TF_GUARDED_BY(mu); // The keys of this map are all the active sessions using this graph. Each // value records whether the graph has been mutated since the corresponding @@ -94,8 +94,8 @@ struct TF_Graph { // TODO(b/74949947): mutations currently trigger a warning instead of a bad // status, this should be reverted when possible. tensorflow::gtl::FlatMap<TF_Session*, tensorflow::string> sessions - GUARDED_BY(mu); - bool delete_requested GUARDED_BY(mu); // set true by TF_DeleteGraph + TF_GUARDED_BY(mu); + bool delete_requested TF_GUARDED_BY(mu); // set true by TF_DeleteGraph // Used to link graphs contained in TF_WhileParams to the parent graph that // will eventually contain the full while loop.
@@ -123,7 +123,7 @@ struct TF_Session { tensorflow::Session* session; TF_Graph* const graph; - tensorflow::mutex mu ACQUIRED_AFTER(TF_Graph::mu); + tensorflow::mutex mu TF_ACQUIRED_AFTER(TF_Graph::mu); int last_num_graph_nodes; // If true, TF_SessionRun and similar methods will call @@ -169,9 +169,9 @@ struct TF_ApiDefMap { } #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) - tensorflow::ApiDefMap api_def_map GUARDED_BY(lock); + tensorflow::ApiDefMap api_def_map TF_GUARDED_BY(lock); #endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) - bool update_docs_called GUARDED_BY(lock); + bool update_docs_called TF_GUARDED_BY(lock); tensorflow::mutex lock; }; @@ -210,10 +210,10 @@ void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output, void RecordMutation(TF_Graph* graph, const TF_Operation& op, const char* mutation_type) - EXCLUSIVE_LOCKS_REQUIRED(graph->mu); + TF_EXCLUSIVE_LOCKS_REQUIRED(graph->mu); bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) - LOCKS_EXCLUDED(session->graph->mu, session->mu); + TF_LOCKS_EXCLUDED(session->graph->mu, session->mu); std::string getTF_OutputDebugString(TF_Output node); diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 2749724d039..c25cb264ce7 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -354,6 +354,7 @@ cc_library( "//tensorflow/core:lib", "@dlpack", ], + alwayslink = 1, ) # TODO(karllessard): only used by //tensorflow/core:mobile_srcs_only_runtime diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 65f37f3021f..96dc288f213 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -1710,8 +1710,9 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, namespace { class CustomDeviceAPI : public tensorflow::CustomDevice { public: - CustomDeviceAPI(TFE_CustomDevice device, void* info, string name) - : device_(device), info_(info), name_(name) {} + CustomDeviceAPI(TFE_Context* context, TFE_CustomDevice device, void* info, + string name) + : context_(context), device_(device), info_(info), name_(name) {} ~CustomDeviceAPI() override { device_.delete_device(info_); } @@ -1725,7 +1726,7 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { std::make_unique(tensor)}; TF_Status status; TFE_TensorHandle* result_handle = - device_.copy_tensor_to_device(&tensor_handle, &status, info_); + device_.copy_tensor_to_device(context_, &tensor_handle, &status, info_); if (!status.status.ok()) return status.status; *result = tensorflow::down_cast( result_handle->handle.get()) @@ -1744,7 +1745,7 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { TFE_TensorHandle tensor_handle{ std::make_unique(tensor)}; TFE_TensorHandle* result_handle = device_.copy_tensor_from_device( - &tensor_handle, target_device_name.c_str(), &status, info_); + context_, &tensor_handle, target_device_name.c_str(), &status, info_); if (!status.status.ok()) return status.status; *result = tensorflow::down_cast( result_handle->handle.get()) @@ -1768,7 +1769,7 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { std::vector outputs(*num_retvals); TF_Status status; TFE_OpAttrs attributes(&op->Attrs(), op->Name().c_str()); - device_.execute(inputs.size(), inputs.data(), op->Name().c_str(), + device_.execute(context_, inputs.size(), inputs.data(), op->Name().c_str(), &attributes, num_retvals, outputs.data(), &status, info_); if (status.status.ok()) { for (int i = 0; i < *num_retvals; ++i) { @@ -1787,6 +1788,7 @@ class CustomDeviceAPI : 
public tensorflow::CustomDevice { } private: + TFE_Context* context_; TFE_CustomDevice device_; void* info_; string name_; @@ -1794,8 +1796,10 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { } // namespace void TFE_RegisterCustomDevice(TFE_Context* ctx, TFE_CustomDevice device, - const char* device_name, void* device_info) { + const char* device_name, void* device_info, + TF_Status* status) { auto custom_device = - std::make_unique<CustomDeviceAPI>(device, device_info, device_name); - ctx->context->RegisterCustomDevice(device_name, std::move(custom_device)); + std::make_unique<CustomDeviceAPI>(ctx, device, device_info, device_name); + status->status = + ctx->context->RegisterCustomDevice(device_name, std::move(custom_device)); } diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index b0f0da5acef..c24735963d6 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -458,27 +458,29 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrValueProto(const TFE_Op* op, size_t proto_len, TF_Status* status); -#define TFE_CUSTOM_DEVICE_VERSION 1 +#define TFE_CUSTOM_DEVICE_VERSION 2 // Struct to be filled in typedef struct TFE_CustomDevice { int version = TFE_CUSTOM_DEVICE_VERSION; // Method to copy a tensor to the custom device. - TFE_TensorHandle* (*copy_tensor_to_device)(TFE_TensorHandle* tensor, + TFE_TensorHandle* (*copy_tensor_to_device)(TFE_Context* context, + TFE_TensorHandle* tensor, TF_Status* status, void* device_info) = nullptr; // Method to copy a tensor from the custom device to a target device. - TFE_TensorHandle* (*copy_tensor_from_device)(TFE_TensorHandle* tensor, + TFE_TensorHandle* (*copy_tensor_from_device)(TFE_Context* context, + TFE_TensorHandle* tensor, const char* target_device_name, TF_Status* status, void* device_info); // Method to execute an operation. - void (*execute)(int num_inputs, TFE_TensorHandle** inputs, - const char* operation_name, const TFE_OpAttrs* attributes, - int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, - void* device_info); + void (*execute)(TFE_Context* context, int num_inputs, + TFE_TensorHandle** inputs, const char* operation_name, + const TFE_OpAttrs* attributes, int* num_outputs, + TFE_TensorHandle** outputs, TF_Status* s, void* device_info); // Method to delete a device. void (*delete_device)(void* device_info); @@ -503,11 +505,21 @@ typedef struct TFE_CustomDevice { // devices, so executing tf.functions which contain operations placed on custom // devices will fail. // +// `device_name` must not name an existing physical or custom device. It must +// follow the format: +// +// /job:<name>/replica:<replica>/task:<task>/device:<type>:<device_num> +// +// If the device is successfully registered, `status` is set to TF_OK. Otherwise +// the device is not usable. In case of a bad status, `device.delete_device` is +// still called on `device_info` (i.e. the caller does not retain ownership). +// // This API is highly experimental, and in particular is expected to change when // it starts supporting operations with attributes and when tf.function support // is added.
void TFE_RegisterCustomDevice(TFE_Context* ctx, TFE_CustomDevice device, - const char* device_name, void* device_info); + const char* device_name, void* device_info, + TF_Status* status); TF_CAPI_EXPORT extern void TFE_ContextGetFunctionDef(TFE_Context* ctx, const char* function_name, diff --git a/tensorflow/c/eager/custom_device_test.cc b/tensorflow/c/eager/custom_device_test.cc index 742844c3f75..b6e6369bb43 100644 --- a/tensorflow/c/eager/custom_device_test.cc +++ b/tensorflow/c/eager/custom_device_test.cc @@ -27,7 +27,6 @@ limitations under the License. namespace { struct LoggingDevice { - TFE_Context* ctx; tensorflow::string device_name; tensorflow::string underlying_device; // Set to true whenever a TensorHandle is copied onto the device @@ -48,7 +47,7 @@ void LoggedTensorDeallocator(void* data, size_t len, void* arg) { } TFE_TensorHandle* MakeLoggedTensorHandle( - TFE_Context* ctx, const tensorflow::string& logging_device_name, + TFE_Context* context, const tensorflow::string& logging_device_name, std::unique_ptr t, TF_Status* status) { std::vector shape(TFE_TensorHandleNumDims(t->tensor, status)); if (TF_GetCode(status) != TF_OK) return nullptr; @@ -58,23 +57,25 @@ TFE_TensorHandle* MakeLoggedTensorHandle( } auto dtype = TFE_TensorHandleDataType(t->tensor); return TFE_NewTensorHandleFromDeviceMemory( - ctx, logging_device_name.c_str(), dtype, shape.data(), shape.size(), + context, logging_device_name.c_str(), dtype, shape.data(), shape.size(), t.release(), 1, &LoggedTensorDeallocator, nullptr, status); } -TFE_TensorHandle* CopyToLoggingDevice(TFE_TensorHandle* tensor, +TFE_TensorHandle* CopyToLoggingDevice(TFE_Context* context, + TFE_TensorHandle* tensor, TF_Status* status, void* device_info) { LoggingDevice* dev = reinterpret_cast(device_info); TFE_TensorHandle* t = TFE_TensorHandleCopyToDevice( - tensor, dev->ctx, dev->underlying_device.c_str(), status); + tensor, context, dev->underlying_device.c_str(), status); if (TF_GetCode(status) != TF_OK) return nullptr; auto dst = std::make_unique(t); *(dev->arrived_flag) = true; - return MakeLoggedTensorHandle(dev->ctx, dev->device_name, std::move(dst), + return MakeLoggedTensorHandle(context, dev->device_name, std::move(dst), status); } -TFE_TensorHandle* CopyTensorFromLoggingDevice(TFE_TensorHandle* tensor, +TFE_TensorHandle* CopyTensorFromLoggingDevice(TFE_Context* context, + TFE_TensorHandle* tensor, const char* target_device_name, TF_Status* status, void* device_info) { @@ -83,13 +84,13 @@ TFE_TensorHandle* CopyTensorFromLoggingDevice(TFE_TensorHandle* tensor, return nullptr; } -void LoggingDeviceExecute(int num_inputs, TFE_TensorHandle** inputs, - const char* operation_name, +void LoggingDeviceExecute(TFE_Context* context, int num_inputs, + TFE_TensorHandle** inputs, const char* operation_name, const TFE_OpAttrs* attributes, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, void* device_info) { LoggingDevice* dev = reinterpret_cast(device_info); - TFE_Op* op(TFE_NewOp(dev->ctx, operation_name, s)); + TFE_Op* op(TFE_NewOp(context, operation_name, s)); if (TF_GetCode(s) != TF_OK) return; TFE_OpAddAttrs(op, attributes); TFE_OpSetDevice(op, dev->underlying_device.c_str(), s); @@ -117,7 +118,7 @@ void LoggingDeviceExecute(int num_inputs, TFE_TensorHandle** inputs, } for (int i = 0; i < *num_outputs; ++i) { auto logged_tensor = std::make_unique(unwrapped_outputs[i]); - outputs[i] = MakeLoggedTensorHandle(dev->ctx, dev->device_name, + outputs[i] = MakeLoggedTensorHandle(context, dev->device_name, std::move(logged_tensor), s); } 
*(dev->executed_flag) = true; @@ -128,19 +129,19 @@ void DeleteLoggingDevice(void* device_info) { } void RegisterLoggingDevice(TFE_Context* context, const char* name, - bool* arrived_flag, bool* executed_flag) { + bool* arrived_flag, bool* executed_flag, + TF_Status* status) { TFE_CustomDevice custom_device; custom_device.copy_tensor_to_device = &CopyToLoggingDevice; custom_device.copy_tensor_from_device = &CopyTensorFromLoggingDevice; custom_device.delete_device = &DeleteLoggingDevice; custom_device.execute = &LoggingDeviceExecute; LoggingDevice* device = new LoggingDevice; - device->ctx = context; device->arrived_flag = arrived_flag; device->executed_flag = executed_flag; device->device_name = name; device->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; - TFE_RegisterCustomDevice(context, custom_device, name, device); + TFE_RegisterCustomDevice(context, custom_device, name, device, status); } TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { @@ -153,7 +154,8 @@ TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context, name, &arrived, &executed); + RegisterLoggingDevice(context, name, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* hcpu = TestMatrixTensorHandle(); ASSERT_FALSE(arrived); TFE_TensorHandle* hdevice = @@ -189,7 +191,9 @@ TEST(CUSTOM_DEVICE, ResetOperation) { bool executed = false; const char* custom_device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), custom_device_name, &arrived, &executed); + RegisterLoggingDevice(context.get(), custom_device_name, &arrived, &executed, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); std::unique_ptr reused_op( TFE_NewOp(context.get(), "Identity", status.get()), TFE_DeleteOp); @@ -217,7 +221,8 @@ TEST(CUSTOM_DEVICE, MakeVariable) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed); + RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a variable handle placed on the custom device. std::unique_ptr op( @@ -291,4 +296,103 @@ TEST(CUSTOM_DEVICE, MakeVariable) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); } +TEST(CUSTOM_DEVICE, AccessVariableOnWrongDevice) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + bool arrived = false; + bool executed = false; + const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // Create a variable handle placed on the custom device. 
+ std::unique_ptr op( + TFE_NewOp(context.get(), "VarHandleOp", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); + TFE_OpSetAttrShape(op.get(), "shape", {}, 0, status.get()); + TFE_OpSetAttrString(op.get(), "container", "", 0); + TFE_OpSetAttrString(op.get(), "shared_name", "", 0); + TFE_OpSetDevice(op.get(), name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + executed = false; + TFE_Execute(op.get(), &var_handle, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + auto handle_cleaner = tensorflow::gtl::MakeCleanup( + [var_handle]() { TFE_DeleteTensorHandle(var_handle); }); + + // Assign to the variable, copying to the custom device. + std::unique_ptr one( + TestScalarTensorHandle(111.f), TFE_DeleteTensorHandle); + op.reset(TFE_NewOp(context.get(), "AssignVariableOp", status.get())); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); + TFE_OpAddInput(op.get(), var_handle, status.get()); + TFE_OpAddInput(op.get(), one.get(), status.get()); + TFE_OpSetDevice(op.get(), name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + executed = false; + num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + + // Read the variable's value. + op.reset(TFE_NewOp(context.get(), "ReadVariableOp", status.get())); + TFE_OpAddInput(op.get(), var_handle, status.get()); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + executed = false; + num_retvals = 1; + TFE_TensorHandle* var_value = nullptr; + TFE_Execute(op.get(), &var_value, &num_retvals, status.get()); + EXPECT_FALSE(TF_GetCode(status.get()) == TF_OK) + << "Execution should fail because the variable is being used on the " + "wrong device."; + // Free the backing buffer for the variable. 
+ op.reset(TFE_NewOp(context.get(), "DestroyResourceOp", status.get())); + TFE_OpAddInput(op.get(), var_handle, status.get()); + TFE_OpSetDevice(op.get(), name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); +} + +TEST(CUSTOM_DEVICE, InvalidRegistrationError) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + bool arrived = false; + bool executed = false; + RegisterLoggingDevice(context.get(), "/device:CUSTOM:0", &arrived, &executed, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_INVALID_ARGUMENT) + << TF_Message(status.get()); + + const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) + << TF_Message(status.get()); + + RegisterLoggingDevice(context.get(), + "/job:localhost/replica:0/task:0/device:CPU:0", + &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) + << TF_Message(status.get()); +} + } // namespace diff --git a/tensorflow/c/kernels/bitcast_op_test.cc b/tensorflow/c/kernels/bitcast_op_test.cc index 7da27e99d1f..33028ea6bd9 100644 --- a/tensorflow/c/kernels/bitcast_op_test.cc +++ b/tensorflow/c/kernels/bitcast_op_test.cc @@ -27,14 +27,10 @@ namespace { class DummyDevice : public DeviceBase { public: - DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {} - bool RequiresRecordingAccessedTensors() const override { return save_; } + explicit DummyDevice(Env* env) : DeviceBase(env) {} Allocator* GetAllocator(AllocatorAttributes /*attr*/) override { return cpu_allocator(); } - - private: - bool save_; }; void TestBitcastOp(Tensor* input_tensor, DataType out_type, @@ -61,7 +57,7 @@ void TestBitcastOp(Tensor* input_tensor, DataType out_type, ASSERT_TRUE(status.ok()) << status.ToString(); OpKernelContext::Params params; - DummyDevice dummy_device(nullptr, false); + DummyDevice dummy_device(nullptr); params.device = &dummy_device; params.op_kernel = kernel.get(); gtl::InlinedVector inputs; diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc index 80e90e7cdf9..423302741de 100644 --- a/tensorflow/c/kernels_test.cc +++ b/tensorflow/c/kernels_test.cc @@ -155,14 +155,10 @@ TEST(TestKernel, TestRegisterKernelBuilder) { class DummyDevice : public DeviceBase { public: - DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {} - bool RequiresRecordingAccessedTensors() const override { return save_; } + explicit DummyDevice(Env* env) : DeviceBase(env) {} Allocator* GetAllocator(AllocatorAttributes /*attr*/) override { return cpu_allocator(); } - - private: - bool save_; }; TEST(TestKernel, TestInputAndOutputCount) { @@ -223,7 +219,7 @@ TEST(TestKernel, TestInputAndOutputCount) { { OpKernelContext::Params p; - DummyDevice dummy_device(nullptr, false); + DummyDevice dummy_device(nullptr); p.device = &dummy_device; p.step_id = 43; 
diff --git a/tensorflow/c/tf_tensor.h b/tensorflow/c/tf_tensor.h index 462fdc8b497..7ed4a9f754e 100644 --- a/tensorflow/c/tf_tensor.h +++ b/tensorflow/c/tf_tensor.h @@ -58,9 +58,9 @@ extern "C" { // start_offset: array[uint64] // data: byte[...] // -// The string length (as a varint), followed by the contents of the string -// is encoded at data[start_offset[i]]]. TF_StringEncode and TF_StringDecode -// facilitate this encoding. +// The string length (as a varint, start_offset[i + 1] - start_offset[i]), +// followed by the contents of the string is encoded at data[start_offset[i]]. +// TF_StringEncode and TF_StringDecode facilitate this encoding. typedef struct TF_Tensor TF_Tensor; diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc index c4add1589e7..da2e12a4a06 100644 --- a/tensorflow/cc/client/client_session.cc +++ b/tensorflow/cc/client/client_session.cc @@ -41,7 +41,7 @@ class ClientSession::Impl { std::shared_ptr graph_; mutable mutex mu_; - mutable int last_num_graph_nodes_ GUARDED_BY(mu_) = 0; + mutable int last_num_graph_nodes_ TF_GUARDED_BY(mu_) = 0; }; ClientSession::ClientSession(const Scope& scope, const string& target) diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h index 6d92d05803d..ca2b5f956bf 100644 --- a/tensorflow/cc/training/coordinator.h +++ b/tensorflow/cc/training/coordinator.h @@ -114,14 +114,14 @@ class Coordinator { condition_variable wait_for_stop_; mutex mu_; - bool should_stop_ GUARDED_BY(mu_); + bool should_stop_ TF_GUARDED_BY(mu_); mutex status_lock_; - Status status_ GUARDED_BY(status_lock_); + Status status_ TF_GUARDED_BY(status_lock_); mutable mutex runners_lock_; std::vector> runners_ - GUARDED_BY(runners_lock_); + TF_GUARDED_BY(runners_lock_); TF_DISALLOW_COPY_AND_ASSIGN(Coordinator); }; diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h index d9ecd221493..4a748bfc924 100644 --- a/tensorflow/cc/training/queue_runner.h +++ b/tensorflow/cc/training/queue_runner.h @@ -119,8 +119,8 @@ class QueueRunner : public RunnerInterface { std::unique_ptr thread_pool_; mutex mu_; int runs_ = 0; - Status status_ GUARDED_BY(mu_); - Status enqueue_status_ GUARDED_BY(mu_); + Status status_ TF_GUARDED_BY(mu_); + Status enqueue_status_ TF_GUARDED_BY(mu_); std::unique_ptr counter_; Coordinator* coord_; @@ -131,7 +131,7 @@ class QueueRunner : public RunnerInterface { std::vector> callbacks_; mutable std::unique_ptr cg_mu_; - std::unique_ptr cost_graph_ GUARDED_BY(cg_mu_); + std::unique_ptr cost_graph_ TF_GUARDED_BY(cg_mu_); RunOptions run_options_; }; diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index dfbea9c49eb..7f1590ff75d 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -37,6 +37,7 @@ cc_library( "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "//tensorflow/compiler/mlir/lite/quantization/xla:quantize", "//tensorflow/compiler/tf2xla", "//tensorflow/compiler/tf2xla:mlir_tf2xla", "//tensorflow/compiler/tf2xla:tf2xla_proto_cc", @@ -64,6 +65,7 @@ cc_library( "@llvm-project//llvm:powerpc_code_gen", # fixdeps: keep "@llvm-project//llvm:target", "@llvm-project//llvm:x86_code_gen", # fixdeps: keep + "//tensorflow/core:regexp_internal", ] + if_llvm_aarch64_available([ "//third_party/llvm/llvm-project/llvm:aarch64_target", # fixdeps: keep ]), diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 
53150e991cc..4a4fec5a386 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -26,6 +26,7 @@ limitations under the License. #include "absl/strings/str_split.h" #include "absl/types/span.h" #include "tensorflow/compiler/aot/embedded_protocol_buffers.h" +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/service/compiler.h" @@ -288,8 +289,8 @@ Status GenVariableMethods(const tf2xla::Config& config, } // Generates code implementing {Arg,Result}Names(), where T is one of -// tf2xla::{Feed,Fetch}. Each feed or fetch name results in a C-style string -// literal in the array, with nullptr terminating the array. +// tf2xla::{Feed,Fetch,Variable}. Each feed or fetch name results in a C-style +// string literal in the array, with nullptr terminating the array. template string GenNameToIndexCode(const T& entries, bool generate) { // No need for a static array if we're not supposed to generate the data. @@ -419,6 +420,16 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, // Generate metadata. const string arg_names_code = GenNameToIndexCode(config.feed(), opts.gen_name_to_index); + + auto variable_copy = config.variable(); + for (auto& var : variable_copy) { + if (var.name().empty()) { + var.set_name(var.node_name()); + } + } + const string variable_names_code = + GenNameToIndexCode(variable_copy, opts.gen_name_to_index); + const string result_names_code = GenNameToIndexCode(config.fetch(), opts.gen_name_to_index); const string include_xla_data_proto = @@ -507,6 +518,9 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { // Number of input arguments for the compiled computation. static constexpr size_t kNumArgs = {{ARG_NUM}}; + // Number of variables for the compiled computation. + static constexpr size_t kNumVariables = {{VARIABLE_NUM}}; + // Byte size of each argument buffer. There are kNumArgs entries. static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) { return BufferInfos()[ArgIndexToBufferIndex()[index]].size(); @@ -522,8 +536,10 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { set_static_data_num_buffers(data, kNumBuffers); set_static_data_arg_index_table(data, ArgIndexToBufferIndex()); set_static_data_num_args(data, kNumArgs); + set_static_data_num_variables(data, kNumVariables); set_static_data_result_index(data, kResultIndex); set_static_data_arg_names(data, StaticArgNames()); + set_static_data_variable_names(data, StaticVariableNames()); set_static_data_result_names(data, StaticResultNames()); set_static_data_program_shape(data, StaticProgramShape()); set_static_data_hlo_profile_printer_data( @@ -626,6 +642,9 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() {{ARG_NAMES_CODE}} + // Array of names of each positional variable, terminated by nullptr. + static const char** StaticVariableNames() {{VARIABLE_NAMES_CODE}} + // Array of names of each positional result, terminated by nullptr. 
static const char** StaticResultNames() {{RESULT_NAMES_CODE}} @@ -654,6 +673,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { {"{{ARG_BYTES_TOTAL}}", absl::StrCat(arg_bytes_total)}, {"{{ARG_NAMES_CODE}}", arg_names_code}, {"{{ARG_NUM}}", absl::StrCat(arg_index_table.size())}, + {"{{VARIABLE_NUM}}", absl::StrCat(config.variable_size())}, {"{{ARG_INDEX_TABLE}}", absl::StrJoin(arg_index_table, ", ")}, {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size}, {"{{CLASS}}", opts.class_name}, @@ -673,6 +693,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))}, {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}", metadata_result.program_shape_access_shim}, + {"{{VARIABLE_NAMES_CODE}}", variable_names_code}, {"{{RESULT_INDEX}}", absl::StrCat(result_index)}, {"{{RESULT_NAMES_CODE}}", result_names_code}, {"{{TEMP_BYTES_ALIGNED}}", absl::StrCat(temp_bytes_aligned)}, diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 6206f68faf9..babbd7fb2f5 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -156,17 +156,14 @@ static void CompareWithGoldenFile( // bazel test --test_strategy=local \ // third_party/tensorflow/compiler/aot:codegen_test const bool update_golden = false; - string golden_file_name; + string golden_file_name = + GetDataDependencyFilepath(tensorflow_relative_golden_file_name); if (update_golden) { - golden_file_name = io::JoinPath(testing::TensorFlowSrcRoot(), - tensorflow_relative_golden_file_name); TF_EXPECT_OK( WriteStringToFile(Env::Default(), golden_file_name, expected_contents)); } - golden_file_name = - GetDataDependencyFilepath(tensorflow_relative_golden_file_name); string golden_file_contents; TF_ASSERT_OK(ReadFileToString(Env::Default(), golden_file_name, &golden_file_contents)); @@ -220,10 +217,16 @@ TEST(CodegenTest, Golden) { {}, {BufferInfo::MakeTempBuffer(1), BufferInfo::MakeEntryParameter(/*size=*/8, /*param_number=*/0), - BufferInfo::MakeTempBuffer(2), + BufferInfo::MakeTempBuffer(1), BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1), - BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)}, - 5, {})); + BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/2), + BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/3), + BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/4), + BufferInfo::MakeTempBuffer(1), BufferInfo::MakeTempBuffer(120)}, + 11, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 1669e728d1a..af58ca233f0 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -55,14 +55,17 @@ namespace bar { // ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): f32[1], (unknown): f32[1], (unknown): s32[5]) -> (u32[5,6], f32[1], s32[5]) // // Memory stats: -// arg bytes total: 104 -// arg bytes aligned: 192 +// arg bytes total: 392 +// arg bytes aligned: 576 // temp bytes total: 126 -// temp bytes aligned: 320 +// temp bytes aligned: 512 class MyClass final : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. 
- static constexpr size_t kNumArgs = 2; + static constexpr size_t kNumArgs = 5; + + // Number of variables for the compiled computation. + static constexpr size_t kNumVariables = 3; // Byte size of each argument buffer. There are kNumArgs entries. static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) { @@ -79,8 +82,10 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { set_static_data_num_buffers(data, kNumBuffers); set_static_data_arg_index_table(data, ArgIndexToBufferIndex()); set_static_data_num_args(data, kNumArgs); + set_static_data_num_variables(data, kNumVariables); set_static_data_result_index(data, kResultIndex); set_static_data_arg_names(data, StaticArgNames()); + set_static_data_variable_names(data, StaticVariableNames()); set_static_data_result_names(data, StaticResultNames()); set_static_data_program_shape(data, StaticProgramShape()); set_static_data_hlo_profile_printer_data( @@ -295,16 +300,22 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { private: // Number of buffers for the compiled computation. - static constexpr size_t kNumBuffers = 6; + static constexpr size_t kNumBuffers = 12; static const ::xla::cpu_function_runtime::BufferInfo* BufferInfos() { static const ::xla::cpu_function_runtime::BufferInfo kBufferInfos[kNumBuffers] = { ::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), ::xla::cpu_function_runtime::BufferInfo({34ULL, 0ULL}), -::xla::cpu_function_runtime::BufferInfo({9ULL, ~0ULL}), +::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), ::xla::cpu_function_runtime::BufferInfo({386ULL, 1ULL}), -::xla::cpu_function_runtime::BufferInfo({13ULL, ~0ULL}), +::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), +::xla::cpu_function_runtime::BufferInfo({386ULL, 2ULL}), +::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), +::xla::cpu_function_runtime::BufferInfo({386ULL, 3ULL}), +::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), +::xla::cpu_function_runtime::BufferInfo({386ULL, 4ULL}), +::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), ::xla::cpu_function_runtime::BufferInfo({481ULL, ~0ULL}) }; return kBufferInfos; @@ -312,13 +323,13 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { static const ::tensorflow::int32* ArgIndexToBufferIndex() { static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { -1, 3 +1, 3, 5, 7, 9 }; return kArgIndexToBufferIndex; } // The 0-based index of the result tuple in the temporary buffers. - static constexpr size_t kResultIndex = 5; + static constexpr size_t kResultIndex = 11; // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() { @@ -326,6 +337,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return kNames; } + // Array of names of each positional variable, terminated by nullptr. + static const char** StaticVariableNames() { + static const char* kNames[] = {"myvar_readonly", "myvar", "myvar2", nullptr}; + return kNames; + } + // Array of names of each positional result, terminated by nullptr. static const char** StaticResultNames() { static const char* kNames[] = {"myfetch", nullptr}; diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index de58c7f8a87..d6d012dcc71 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "llvm-c/Target.h" #include "tensorflow/compiler/aot/codegen.h" #include "tensorflow/compiler/aot/flags.h" +#include "tensorflow/compiler/mlir/lite/quantization/xla/quantize.h" #include "tensorflow/compiler/tf2xla/tf2xla.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -39,6 +40,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -105,14 +107,18 @@ Status CompileGraph(GraphDef graph_def, const tf2xla::Config& config, .ValueOrDie(); xla::XlaComputation computation; if (flags.mlir_components == "Bridge") { - TF_RETURN_IF_ERROR( - ConvertGraphDefToXlaViaMlir(graph_def, config, &computation)); + TF_RETURN_IF_ERROR(ConvertGraphDefToXlaViaMlir( + graph_def, config, &computation, flags.debug_info, + flags.debug_info_path_begin_marker)); } else if (flags.mlir_components.empty() || flags.mlir_components == "None") { TF_RETURN_IF_ERROR(ConvertGraphDefToXla(std::move(graph_def), config, client, &computation)); } else { return errors::Unknown("Unknown mlir_components ", flags.mlir_components); } + if (flags.quantize) { + TF_RETURN_IF_ERROR(mlir::xla_hlo::XlaQuantize(config, &computation)); + } if (!flags.out_session_module.empty()) { TF_ASSIGN_OR_RETURN(std::unique_ptr module, computation.Snapshot()); @@ -166,6 +172,23 @@ static void InitializeTargets() { LLVMInitializeX86AsmPrinter(); } +// Replaces {{tag.type tag.name}} in the error message with tag_name. +// TODO(bixia): We currently only handlge tag.type == "node". +// +// In the error message, a graph node is represented as {{tag.type, tag.name}}, +// to allow a Python debugger to insert source information about the graph node. +// For example, a Python add expression may be represented as +// {{node, x_y_sum}} = Add(x, y) in the error message. See routine interpolate +// in tensorflow/python/framework/error_interpolation.py for more detail. +static std::string InterpolateErrorMessage(std::string message) { + // See _NAME_REGEX in tensorflow/python/framework/error_interpolation.py + // Change "prefix {{node tag.name}} suffix" to "prefix tag.name suffix". + static LazyRE2 pattern{"(.*){{node (.*)}}(.*)"}; + RE2::GlobalReplace(&message, *pattern, "\\1\\2\\3"); + + return message; +} + Status Main(const MainFlags& flags) { absl::call_once(targets_init, &InitializeTargets); @@ -192,8 +215,13 @@ Status Main(const MainFlags& flags) { GraphDef graph_def; TF_RETURN_IF_ERROR(ReadProtoFile(flags.graph, &graph_def)); CompileResult compile_result; - TF_RETURN_IF_ERROR( - CompileGraph(std::move(graph_def), config, flags, &compile_result)); + + Status status = + CompileGraph(std::move(graph_def), config, flags, &compile_result); + if (!status.ok()) { + return Status(status.code(), + InterpolateErrorMessage(status.error_message())); + } // Write output files. Env* env = Env::Default(); diff --git a/tensorflow/compiler/aot/flags.cc b/tensorflow/compiler/aot/flags.cc index e7040d12b8b..e8168bf706e 100644 --- a/tensorflow/compiler/aot/flags.cc +++ b/tensorflow/compiler/aot/flags.cc @@ -24,6 +24,13 @@ void AppendMainFlags(std::vector* flag_list, MainFlags* flags) { "Input GraphDef file. 
If the file ends in '.pbtxt' it is expected to " "be in the human-readable proto text format, otherwise it is expected " "to be in the proto binary format."}, + {"debug_info", &flags->debug_info, + "Graph debug info file. If the file ends in '.pbtxt' it is expected to " + "be in the human-readable proto text format, otherwise it is expected " + "to be in the proto binary format."}, + {"debug_info_path_begin_marker", &flags->debug_info_path_begin_marker, + "If not none, only keep the file path in the debug information after the" + " marker. The default value is empty"}, {"config", &flags->config, "Input file containing Config proto. If the file ends in '.pbtxt' it " "is expected to be in the human-readable proto text format, otherwise " @@ -70,6 +77,8 @@ void AppendMainFlags(std::vector* flag_list, MainFlags* flags) { "Output session module proto."}, {"mlir_components", &flags->mlir_components, "The MLIR components to enable. Currently only Bridge is supported."}, + {"quantize", &flags->quantize, + "If set, quantization will be applied before HLO code generation."}, {"gen_name_to_index", &flags->gen_name_to_index, "Generate name-to-index data for Lookup{Arg,Result}Index methods."}, {"gen_program_shape", &flags->gen_program_shape, diff --git a/tensorflow/compiler/aot/flags.h b/tensorflow/compiler/aot/flags.h index 451a0455977..96395c7501b 100644 --- a/tensorflow/compiler/aot/flags.h +++ b/tensorflow/compiler/aot/flags.h @@ -28,6 +28,8 @@ namespace tfcompile { struct MainFlags { string graph; + string debug_info; + string debug_info_path_begin_marker; string config; bool dump_fetch_nodes = false; string target_triple; @@ -40,6 +42,7 @@ struct MainFlags { string out_header; string out_session_module; string mlir_components; + bool quantize = false; // C++ codegen options bool gen_name_to_index = false; diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 6f438f0e271..0c44ed8bf37 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -1,11 +1,37 @@ load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package( default_visibility = ["//visibility:private"], licenses = ["notice"], # Apache 2.0 ) +glob_lit_tests( + data = [":filecheck_test_utilities"], + driver = "@llvm-project//mlir:run_lit.sh", + tags_override = { + "test_error_message.lit.pbtxt": ["no_oss"], # TODO(b/150957738): to be fixed on oss. + }, + test_file_exts = ["lit.pbtxt"], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "filecheck_test_utilities", + testonly = True, + srcs = [ + "test_error_message.lit.pbtxt.config.pbtxt", + "test_error_message.lit.pbtxt.debug.pbtxt", + "test_error_message.lit.pbtxt.fake_py.debug", + ], + data = [ + "//tensorflow/compiler/aot:tfcompile", + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", + ], +) + # We disable some tfcompile tests in the open source build with the # "manual" tag to avoid making our OSS users build LLVM twice # (once for host and once for target). 
@@ -60,6 +86,7 @@ genrule( testonly = 1, outs = [ "test_graph_tfadd.pb", + "test_debuginfo_tfadd.pb", "test_graph_tfadd_with_ckpt.ckpt", "test_graph_tfadd_with_ckpt.pb", "test_graph_tfadd_with_ckpt_saver.ckpt", @@ -317,6 +344,7 @@ tf_library( testonly = 1, config = "test_graph_tfadd.config.pbtxt", cpp_class = "AddComp", + debug_info = "test_debuginfo_tfadd.pb", graph = "test_graph_tfadd.pb", include_standard_runtime_deps = False, mlir_components = "Bridge", diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py index a96ba0e6919..629239d6e4a 100644 --- a/tensorflow/compiler/aot/tests/make_test_graphs.py +++ b/tensorflow/compiler/aot/tests/make_test_graphs.py @@ -30,6 +30,7 @@ from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import error_interpolation from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -184,7 +185,22 @@ def tfvariable_sequential_updates(_): array_ops.identity(updates, name='result') -def write_graph(build_graph, out_dir): +def export_debug_info(exported_graph): + """Exports debug information from a graph. + + Args: + exported_graph: A Graph that has been created by tracing a saveable view. + + Returns: + Corresponding GraphDebugInfo with traces for all ops in exported_graph. + """ + exported_operations = [] + for op in exported_graph.get_operations(): + exported_operations.append(('', op)) + return error_interpolation.create_graph_debug_info_def(exported_operations) + + +def write_graph(build_graph, out_dir, debug_info=False): """Build a graph using build_graph and write it out.""" g = ops.Graph() with g.as_default(): @@ -193,10 +209,19 @@ def write_graph(build_graph, out_dir): with open(filename, 'wb') as f: f.write(six.ensure_binary(g.as_graph_def().SerializeToString())) + if debug_info: + filename_debuginfo = os.path.join( + out_dir, 'test_debuginfo_%s.pb' % build_graph.__name__) + test_debuginfo = export_debug_info(g) + with open(filename_debuginfo, 'wb') as f: + f.write( + six.ensure_binary( + test_debuginfo.SerializeToString(deterministic=True))) + def main(_): control_flow_util.enable_control_flow_v2() - write_graph(tfadd, FLAGS.out_dir) + write_graph(tfadd, FLAGS.out_dir, debug_info=True) write_graph(tfadd_with_ckpt, FLAGS.out_dir) write_graph(tfadd_with_ckpt_saver, FLAGS.out_dir) write_graph(tfassert_eq, FLAGS.out_dir) diff --git a/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt new file mode 100644 index 00000000000..5b05eb4b33d --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt @@ -0,0 +1,69 @@ +# RUN: not tfcompile --graph=%s --config=%s.config.pbtxt --mlir_components=Bridge --debug_info=%s.debug.pbtxt 2>&1 | FileCheck %s -dump-input-on-failure +# RUN: not tfcompile --graph=%s --config=%s.config.pbtxt --mlir_components=None 2>&1 | FileCheck -check-prefix=OLD %s -dump-input-on-failure + +# Checks the error message produced by tfcompile with mlir_component +# Checks that source debug information is used in the output error message and +# the node x_y_sum = Add +# CHECK: INVALID ARGUMENTS: Dimensions must be equal, but are 2 and 3 for 'x_y_sum = Add[T=DT_INT32](aot_feed_0/x, aot_feed_0/y)' +# CHECK: math_ops.add(x, y, name='x_y_sum') +# CHECK: 
build_graph(out_dir) + +# Checks the error message produced by tfcompile without mlir_component +# OLD: INVALID ARGUMENTS: Incompatible shapes: [2] vs. [3] +# OLD: x_y_sum + +node: { + name: "x" + op: "Placeholder" + attr: { + key: "shape" + value: { + shape: { + dim: { + size: -1 + } + } + } + } + attr: { + key: "dtype" + value: { + type: DT_INT32 + } + } +} +node: { + name: "y" + op: "Placeholder" + attr: { + key: "shape" + value: { + shape: { + dim: { + size: -1 + } + } + } + } + attr: { + key: "dtype" + value: { + type: DT_INT32 + } + } +} +node: { + name: "x_y_sum" + op: "Add" + input: "x" + input: "y" + attr: { + key: "T" + value: { + type: DT_INT32 + } + } +} +versions: { + producer: 321 +} diff --git a/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.config.pbtxt b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.config.pbtxt new file mode 100644 index 00000000000..2694e67da06 --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.config.pbtxt @@ -0,0 +1,16 @@ +# Text form of tensorflow.tf2xla.Config proto. +feed { + id { node_name: "x" } + shape { + dim { size: 2 } + } +} +feed { + id { node_name: "y" } + shape { + dim { size: 3 } + } +} +fetch { + id { node_name: "x_y_sum" } +} diff --git a/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt new file mode 100644 index 00000000000..7acc8287950 --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.debug.pbtxt @@ -0,0 +1,28 @@ +files: "org_tensorflow/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.fake_py.debug" +traces: { + key: "x@" + value: { + file_line_cols: { + line: 1 + } + } +} +traces: { + key: "x_y_sum@" + value: { + file_line_cols: { + line: 3 + } + file_line_cols: { + line: 4 + } + } +} +traces: { + key: "y@" + value: { + file_line_cols: { + line: 2 + } + } +} diff --git a/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.fake_py.debug b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.fake_py.debug new file mode 100644 index 00000000000..083e8d522d5 --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_error_message.lit.pbtxt.fake_py.debug @@ -0,0 +1,4 @@ + x = value + y = value + math_ops.add(x, y, name='x_y_sum') + build_graph(out_dir) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 80606b6c5ee..35a054a1aab 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -26,6 +26,7 @@ def tf_library( name, graph, config, + debug_info = None, freeze_checkpoint = None, freeze_saver = None, cpp_class = None, @@ -191,12 +192,15 @@ def tf_library( mlir_flag = "--mlir_components=" + mlir_components + srcs = [tfcompile_graph, config] + debug_info_flag = "" + if debug_info: + srcs.append(debug_info) + debug_info_flag = " --debug_info=$(location " + debug_info + ")" + native.genrule( name = ("gen_" + name), - srcs = [ - tfcompile_graph, - config, - ], + srcs = srcs, outs = [ header_file, metadata_object_file, @@ -206,6 +210,7 @@ def tf_library( "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + + debug_info_flag + " --config=$(location " + config + ")" + " --entry_point=" + ep + " --cpp_class=" + cpp_class + @@ -237,10 +242,7 @@ def tf_library( session_module_pb = name + "_session_module.pb" native.genrule( name = (name + "_session_module"), - srcs = [ - tfcompile_graph, - config, - ], + srcs 
= srcs, outs = [ session_module_pb, ], @@ -248,6 +250,7 @@ def tf_library( "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + + debug_info_flag + " --config=$(location " + config + ")" + " --entry_point=" + ep + " --cpp_class=" + cpp_class + diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index d027bae5d04..f0cf8f2ded9 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -65,6 +65,7 @@ int main(int argc, char** argv) { flags.out_metadata_object = "out_helper.o"; flags.out_header = "out.h"; flags.entry_point = "entry"; + flags.debug_info_path_begin_marker = ""; std::vector flag_list; AppendMainFlags(&flag_list, &flags); @@ -81,12 +82,10 @@ int main(int argc, char** argv) { tensorflow::port::InitMain(usage.c_str(), &argc, &argv); QCHECK(argc == 1) << "\nERROR: This command does not take any arguments " - "other than flags\n\n" - << usage; + "other than flags. See --help.\n\n"; tensorflow::Status status = tensorflow::tfcompile::Main(flags); if (status.code() == tensorflow::error::INVALID_ARGUMENT) { - std::cerr << "INVALID ARGUMENTS: " << status.error_message() << "\n\n" - << usage; + std::cerr << "INVALID ARGUMENTS: " << status.error_message() << "\n\n"; return 1; } else { TF_QCHECK_OK(status); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index acbd2d27a45..f71331af0df 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -184,6 +184,7 @@ XLA_DEVICE_DEPS = [ "//tensorflow/core:core_cpu_internal", "//tensorflow/core:dataset_ops_op_lib", "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", "//tensorflow/core:functional_ops_op_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD index 618fafe759b..61d0c0de35f 100644 --- a/tensorflow/compiler/jit/graphcycles/BUILD +++ b/tensorflow/compiler/jit/graphcycles/BUILD @@ -18,6 +18,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc index 6ec9b5a477a..6c5e3a745e2 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc @@ -368,14 +368,20 @@ bool GraphCycles::CanContractEdge(int32 a, int32 b) { return !reachable; } -bool GraphCycles::ContractEdge(int32 a, int32 b) { +absl::optional GraphCycles::ContractEdge(int32 a, int32 b) { CHECK(HasEdge(a, b)); RemoveEdge(a, b); if (IsReachableNonConst(a, b)) { // Restore the graph to its original state. InsertEdge(a, b); - return false; + return absl::nullopt; + } + + if (rep_->nodes_[b]->in.Size() + rep_->nodes_[b]->out.Size() > + rep_->nodes_[a]->in.Size() + rep_->nodes_[a]->out.Size()) { + // Swap "a" and "b" to minimize copying. + std::swap(a, b); } Node* nb = rep_->nodes_[b]; @@ -399,7 +405,8 @@ bool GraphCycles::ContractEdge(int32 a, int32 b) { InsertEdge(y, a); } - return true; + // Note, if the swap happened it might be what originally was called "b". 
+ return a; } absl::Span GraphCycles::Successors(int32 node) const { diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.h b/tensorflow/compiler/jit/graphcycles/graphcycles.h index bbf61016fb3..3e20c4e641c 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.h +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.h @@ -40,6 +40,7 @@ limitations under the License. // FindPath() is linear in the size of the graph. // The current implementation uses O(|V|+|E|) space. +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -80,11 +81,11 @@ class GraphCycles { // Return whether there is an edge directly from source_node to dest_node. bool HasEdge(int32 source_node, int32 dest_node) const; - // Contracts the edge from 'a' to node 'b', merging nodes 'a' and 'b'. 'b' is - // removed from the graph, and edges to/from 'b' are replaced with edges - // to/from 'a'. If contracting the edge would create a cycle, does nothing - // and returns false. - bool ContractEdge(int32 a, int32 b); + // Contracts the edge from 'a' to node 'b', merging nodes 'a' and 'b'. One of + // the nodes is removed from the graph, and edges to/from it are added to + // the remaining one, which is returned. If contracting the edge would create + // a cycle, does nothing and return no value. + absl::optional ContractEdge(int32 a, int32 b); // Return true if can contract edge, otherwise return false. bool CanContractEdge(int32 a, int32 b); diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc index 274f5938a12..5b7eec19e27 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include #include #include #include @@ -479,19 +480,21 @@ TEST_F(GraphCyclesTest, ContractEdge) { ASSERT_TRUE(AddEdge(2, 4)); ASSERT_TRUE(AddEdge(3, 4)); - EXPECT_FALSE(g_.ContractEdge(1, 3)); + EXPECT_FALSE(g_.ContractEdge(1, 3).has_value()); CHECK(g_.CheckInvariants()); EXPECT_TRUE(g_.HasEdge(1, 3)); - EXPECT_TRUE(g_.ContractEdge(1, 2)); + // Node (2) has more edges. + EXPECT_EQ(g_.ContractEdge(1, 2).value(), 2); CHECK(g_.CheckInvariants()); - EXPECT_TRUE(g_.HasEdge(1, 3)); - EXPECT_TRUE(g_.HasEdge(1, 4)); + EXPECT_TRUE(g_.HasEdge(2, 3)); + EXPECT_TRUE(g_.HasEdge(2, 4)); EXPECT_TRUE(g_.HasEdge(3, 4)); - EXPECT_TRUE(g_.ContractEdge(1, 3)); + // Node (2) has more edges. + EXPECT_EQ(g_.ContractEdge(2, 3).value(), 2); CHECK(g_.CheckInvariants()); - EXPECT_TRUE(g_.HasEdge(1, 4)); + EXPECT_TRUE(g_.HasEdge(2, 4)); } TEST_F(GraphCyclesTest, CanContractEdge) { @@ -527,3 +530,26 @@ static void BM_StressTest(int iters, int num_nodes) { } } BENCHMARK(BM_StressTest)->Range(2048, 1048576); + +static void BM_ContractEdge(int iters, int num_nodes) { + while (iters-- > 0) { + tensorflow::testing::StopTiming(); + tensorflow::GraphCycles g; + std::vector nodes; + nodes.reserve(num_nodes); + for (int i = 0; i < num_nodes; i++) { + nodes.push_back(g.NewNode()); + } + // All edges point toward the last one. 
+ for (int i = 0; i < num_nodes - 1; ++i) { + g.InsertEdge(nodes[i], nodes[num_nodes - 1]); + } + + tensorflow::testing::StartTiming(); + int node = num_nodes - 1; + for (int i = 0; i < num_nodes - 1; ++i) { + node = g.ContractEdge(nodes[i], node).value(); + } + } +} +BENCHMARK(BM_ContractEdge)->Arg(1000)->Arg(10000); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 2b58a9260ba..c64f4d32535 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -172,8 +172,9 @@ class XlaExecutableClosureStore { private: mutex mutex_; - int64 key_counter_ GUARDED_BY(mutex_); - absl::flat_hash_map closures_ GUARDED_BY(mutex_); + int64 key_counter_ TF_GUARDED_BY(mutex_); + absl::flat_hash_map closures_ + TF_GUARDED_BY(mutex_); TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore); }; diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h index 836cb7e6862..112408226a8 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.h +++ b/tensorflow/compiler/jit/kernels/xla_ops.h @@ -165,7 +165,8 @@ class XlaCompileOp : public OpKernel { // error when compiling the cluster this _XlaCompile is supposed to compile. // If `cannot_compile_cluster_` is true then we avoid compiling this cluster // on any future calls to _XlaCompile. - bool cannot_compile_cluster_ GUARDED_BY(cannot_compile_cluster_mu_) = false; + bool cannot_compile_cluster_ TF_GUARDED_BY(cannot_compile_cluster_mu_) = + false; mutex cannot_compile_cluster_mu_; }; diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 08dc1b13db6..2a29527bfef 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -161,6 +161,11 @@ class MarkForCompilationPassImpl { // The ID of the cluster as represented in `cycles_graph_`. int cycles_graph_node_id() const { return cycles_graph_node_id_; } + // Sets the ID of the cluster as represented in `cycles_graph_`. + void set_cycles_graph_node_id(int cycles_graph_node_id) { + cycles_graph_node_id_ = cycles_graph_node_id; + } + // The size of the cluster excluding constant and identity nodes. int effective_cluster_size() const { return effective_cluster_size_; } @@ -381,14 +386,16 @@ class MarkForCompilationPassImpl { // R, B} cluster. string DescribePotentialCycle(int from, int to); - // Merge the clusters `cluster_from` and `cluster_to`. After this step the - // larger combined cluster is represented by `cluster_from`'s ID in - // `cycles_graph_`. + // Merge the clusters `cluster_from` and `cluster_to`. After this step the + // larger combined cluster is represented by `cluster_from`, but it can have + // `cycles_graph_`'s ID of either `cluster_from` or `cluster_to`, depending on + // which way requires fewer operations. bool MergeClusters(Cluster* cluster_from, Cluster* cluster_to) { int from = cluster_from->cycles_graph_node_id(); int to = cluster_to->cycles_graph_node_id(); - if (!cycles_graph_.ContractEdge(from, to)) { + auto optional_merged_node = cycles_graph_.ContractEdge(from, to); + if (!optional_merged_node.has_value()) { VLOG(3) << "Could not contract " << cluster_from->DebugString(*graph_) << " -> " << cluster_to->DebugString(*graph_) << " because contracting the edge would create a cycle via " @@ -398,6 +405,8 @@ // Merge the clusters.
cluster_from->Merge(cluster_to); + // Update `cycle_graph_`'s ID. + cluster_from->set_cycles_graph_node_id(optional_merged_node.value()); // Merge the UnionFind. cluster_for_node_[from].Merge(&cluster_for_node_[to]); @@ -1911,6 +1920,7 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "LinSpace", "ListDiff", "LogMatrixDeterminant", + "LowerBound", "MatMul", "MatrixBandPart", "MatrixDiag", @@ -2037,6 +2047,7 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "TensorScatterUpdate", "TridiagonalSolve", "TruncatedNormal", + "UpperBound", "UnsortedSegmentMax", "UnsortedSegmentMin", "UnsortedSegmentProd", diff --git a/tensorflow/compiler/jit/xla_activity_listener.cc b/tensorflow/compiler/jit/xla_activity_listener.cc index a1ea6a6bf8e..ae28bf10fb2 100644 --- a/tensorflow/compiler/jit/xla_activity_listener.cc +++ b/tensorflow/compiler/jit/xla_activity_listener.cc @@ -18,13 +18,15 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/thread_annotations.h" namespace tensorflow { namespace { // The list of all registered `XlaActivityListener`s. struct XlaActivityListenerList { absl::Mutex mutex; - std::vector> listeners GUARDED_BY(mutex); + std::vector> listeners + TF_GUARDED_BY(mutex); }; void FlushAllListeners(); diff --git a/tensorflow/compiler/jit/xla_cluster_util_test.cc b/tensorflow/compiler/jit/xla_cluster_util_test.cc index acac2f7d055..6333499b0c8 100644 --- a/tensorflow/compiler/jit/xla_cluster_util_test.cc +++ b/tensorflow/compiler/jit/xla_cluster_util_test.cc @@ -50,7 +50,7 @@ TEST(CreateCycleDetectionGraph, ConnectivityThroughEnterExitRegion) { GraphCycles cycles; TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles).status()); - EXPECT_FALSE(cycles.ContractEdge(a.node()->id(), b.node()->id())); + EXPECT_FALSE(cycles.CanContractEdge(a.node()->id(), b.node()->id())); } TEST(CreateCycleDetectionGraph, ConnectivityThroughMultipleEnterExitRegions) { @@ -69,7 +69,7 @@ TEST(CreateCycleDetectionGraph, ConnectivityThroughMultipleEnterExitRegions) { GraphCycles cycles; TF_ASSERT_OK(CreateCycleDetectionGraph(root.graph(), &cycles).status()); - EXPECT_FALSE(cycles.ContractEdge(a.node()->id(), b.node()->id())); + EXPECT_FALSE(cycles.CanContractEdge(a.node()->id(), b.node()->id())); } TEST(CreateCycleDetectionGraph, ReachingEnterExit) { diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 03a9a3ad3a4..5540fee7276 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/util.h" @@ -33,6 +34,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/metrics.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/hash/hash.h" @@ -202,6 +204,52 @@ static bool ShouldBeMegamorphic(int64 compile_count, int64 execution_count) { execution_count < kMinExecutionsPerCompile * compile_count; } +// Creates a simple graph using the specified op as the only op apart from the +// arg and retval nodes. +static xla::StatusOr> CreateGraph( + const NodeDef& node_def, absl::Span args, + absl::Span result_types) { + // TODO(b/74182462): We implement this by creating a new dummy Graph including + // _Arg nodes, and let CompileGraph walk it. This could be optimized. + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + Status status; + // First create the actual node we care about computing. + Node* main_node = graph->AddNode(node_def, &status); + TF_RETURN_IF_ERROR(status); + + // Create dummy _Arg nodes. Link these to `node` and also via a control + // dependency edge to the _SOURCE node. + for (int64 i = 0; i < args.size(); ++i) { + Node* node; + string arg_name = absl::StrCat("_arg", i); + Status status = + NodeBuilder(arg_name, FunctionLibraryDefinition::kArgOp) + .ControlInput(graph->source_node()) + .Attr("T", args[i].kind == XlaCompiler::Argument::kResource + ? DT_RESOURCE + : args[i].type) + .Attr("index", i) + .Finalize(graph.get(), &node); + TF_RETURN_IF_ERROR(status); + graph->AddEdge(node, 0, main_node, i); + } + + // Similarly with return values, create dummy _Retval nodes fed by `node`. + for (int64 i = 0; i < result_types.size(); ++i) { + Node* node; + string retval_name = absl::StrCat("_retval", i); + Status status = NodeBuilder(retval_name, FunctionLibraryDefinition::kRetOp) + .Input(main_node, i) + .Attr("T", result_types[i]) + .Attr("index", i) + .Finalize(graph.get(), &node); + TF_RETURN_IF_ERROR(status); + } + FixupSourceAndSinkEdges(graph.get()); + return graph; +} + Status XlaCompilationCache::CompileSingleOp( const XlaCompiler::Options& options, absl::Span args, OpKernelContext* ctx, @@ -222,8 +270,11 @@ Status XlaCompilationCache::CompileSingleOp( for (int i = 0; i < result_dtypes.size(); ++i) { result_dtypes[i] = ctx->expected_output_dtype(i); } - return compiler->CompileSingleOp(compile_options, ctx->op_kernel().def(), - args, result_dtypes, result); + + const NodeDef& node_def = ctx->op_kernel().def(); + TF_ASSIGN_OR_RETURN(auto graph, CreateGraph(node_def, args, result_dtypes)); + return compiler->CompileGraph(compile_options, node_def.name(), + std::move(graph), args, result); }; return CompileImpl(options, name, args, compile_op, /*compile_threshold=*/absl::nullopt, diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index b3653a2006a..83a0bda97d5 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -151,19 +151,19 @@ class XlaCompilationCache : public ResourceBase { int64 request_count = 0; // Did compilation succeed? - Status compilation_status GUARDED_BY(mu); + Status compilation_status TF_GUARDED_BY(mu); // Output of the XlaCompiler. - XlaCompiler::CompilationResult compilation_result GUARDED_BY(mu); + XlaCompiler::CompilationResult compilation_result TF_GUARDED_BY(mu); // The XLA executable compiled from . 
May be null if no // executable has been built. - std::unique_ptr executable GUARDED_BY(mu); + std::unique_ptr executable TF_GUARDED_BY(mu); }; mutex compile_cache_mu_; absl::flat_hash_map, Signature::Hash> cache_ - GUARDED_BY(compile_cache_mu_); + TF_GUARDED_BY(compile_cache_mu_); struct ClusterCompileStats { // Number of times the cluster has been (re-)compiled. @@ -185,7 +185,7 @@ class XlaCompilationCache : public ResourceBase { // Maps cluster names to compilation statistics for said cluster. absl::flat_hash_map cluster_compile_stats_ - GUARDED_BY(cluster_compile_stats_mu_); + TF_GUARDED_BY(cluster_compile_stats_mu_); // The number of times a lazy compilation must be requested for a specific // signature before we attempt to compile it. diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 830aaf74186..0cc462678b1 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -83,7 +83,7 @@ class XlaDeviceAllocatorState { std::unordered_map, std::unique_ptr, hash>> - allocators_ GUARDED_BY(allocator_mutex_); + allocators_ TF_GUARDED_BY(allocator_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(XlaDeviceAllocatorState); }; diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 546df476d7f..30f9a99e36a 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -137,7 +137,7 @@ class XlaDevice : public LocalDevice { ~XlaDevice() override; Allocator* GetAllocator(AllocatorAttributes attr) override - LOCKS_EXCLUDED(mu_); + TF_LOCKS_EXCLUDED(mu_); void Compute(OpKernel* op_kernel, OpKernelContext* context) override; void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) override; @@ -145,18 +145,18 @@ class XlaDevice : public LocalDevice { void Sync(const DoneCallback& done) override; Status TryGetDeviceContext(DeviceContext** out_context) override - LOCKS_EXCLUDED(mu_); + TF_LOCKS_EXCLUDED(mu_); Status MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, - Tensor* tensor) override LOCKS_EXCLUDED(mu_); + Tensor* tensor) override TF_LOCKS_EXCLUDED(mu_); // Allocate tensor on fast memory space. This is only applied to the new TPU // hardware which has faster read/write memory. If the hardware doesn't // have such memory space, we fallback to the ordinary memory space. Status MakeFastMemTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, - Tensor* tensor) LOCKS_EXCLUDED(mu_); + Tensor* tensor) TF_LOCKS_EXCLUDED(mu_); const Metadata& metadata() { return xla_metadata_; } @@ -166,34 +166,35 @@ class XlaDevice : public LocalDevice { // // TODO(b/111859745): The Eager context needs to call this method to recover // from failures. - Status EnsureDeviceContextOk() LOCKS_EXCLUDED(mu_); + Status EnsureDeviceContextOk() TF_LOCKS_EXCLUDED(mu_); // Instructs this XlaDevice to set a GpuDeviceInfo, which holds extra // information for GPU and TPU devices. - Status UseGpuDeviceInfo() LOCKS_EXCLUDED(mu_); + Status UseGpuDeviceInfo() TF_LOCKS_EXCLUDED(mu_); // Instructs this XlaDevice to return 'sync_on_completion' for // AllowsSyncOnCompletion(). 
- void SetAllowsSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_); - bool AllowsSyncOnCompletion() const override LOCKS_EXCLUDED(mu_); + void SetAllowsSyncOnCompletion(bool sync_on_completion) + TF_LOCKS_EXCLUDED(mu_); + bool AllowsSyncOnCompletion() const override TF_LOCKS_EXCLUDED(mu_); // Installs an error handling callback when RefreshStatus sees !status.ok(). void SetHandleDeviceErrorCallback(std::function callback); - Status RefreshStatus() override LOCKS_EXCLUDED(mu_); + Status RefreshStatus() override TF_LOCKS_EXCLUDED(mu_); private: xla::StatusOr GetOrCreateClient() const; Allocator* GetAllocatorLocked(AllocatorAttributes attr) - EXCLUSIVE_LOCKS_REQUIRED(mu_); + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); Status EnsureStreamOkLocked(xla::Backend* backend, const string& name, std::shared_ptr* stream, bool* stream_was_changed) - EXCLUSIVE_LOCKS_REQUIRED(mu_); + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Return a pair of device context, the second one is fast_mem device context. xla::StatusOr> - GetDeviceContextLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_); + GetDeviceContextLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); static Status GetMetadataFromDevice(DeviceBase* device, const XlaDevice::Metadata** metadata); @@ -218,13 +219,13 @@ class XlaDevice : public LocalDevice { // Intra-op threads to spawn (from SessionOptions). const int intra_op_parallelism_threads_; // Memory allocator associated with this device. - Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr; // Not owned. + Allocator* xla_allocator_ TF_GUARDED_BY(mu_) = nullptr; // Not owned. // Stream associated with this device. Operations enqueued on this // stream are executed on the device. Operations include data // copying back and forth between CPU and the device, and // computations enqueued by XLA. - std::shared_ptr stream_ GUARDED_BY(mu_); + std::shared_ptr stream_ TF_GUARDED_BY(mu_); // If false, only stream_ is valid and all computation and transfers use // stream_. If true, computation is performed by stream_ and transfers are // performed by host_to_device/device_to_device stream or borrowing a stream @@ -232,36 +233,36 @@ class XlaDevice : public LocalDevice { const bool use_multiple_streams_; // If use_multiple_streams_, host to device transfers are performed using this // stream. - std::shared_ptr host_to_device_stream_ GUARDED_BY(mu_); + std::shared_ptr host_to_device_stream_ TF_GUARDED_BY(mu_); // If use_multiple_streams_, transfers between different devices are performed // using these streams. std::vector> device_to_device_streams_ - GUARDED_BY(mu_); + TF_GUARDED_BY(mu_); const XlaCompiler::ShapeRepresentationFn shape_representation_fn_; // The device context accessed by all users of the XlaDevice, set by calls to // EnsureDeviceContextOk. If gpu_device_info_ is non-null, this pointer is // also filled in to that struct. XlaDeviceContext is a ref-counted object. - XlaDeviceContext* device_context_ GUARDED_BY(mu_) = nullptr; + XlaDeviceContext* device_context_ TF_GUARDED_BY(mu_) = nullptr; // The device context will allocate memory on fast memory space on TPU. // XlaDeviceContext is a ref-counted object. - XlaDeviceContext* fast_mem_device_context_ GUARDED_BY(mu_) = nullptr; + XlaDeviceContext* fast_mem_device_context_ TF_GUARDED_BY(mu_) = nullptr; // Holds extra information for GPU and TPU devices, e.g. the device context. 
- bool use_gpu_device_info_ GUARDED_BY(mu_) = false; - std::unique_ptr gpu_device_info_ GUARDED_BY(mu_); + bool use_gpu_device_info_ TF_GUARDED_BY(mu_) = false; + std::unique_ptr gpu_device_info_ TF_GUARDED_BY(mu_); // Thread pool used for running closures std::unique_ptr thread_pool_; // True if the device allows XlaDevice::Sync to be called on completion // regardless of status. - bool sync_on_completion_ GUARDED_BY(mu_) = true; + bool sync_on_completion_ TF_GUARDED_BY(mu_) = true; // A callback that will be invoked when RefreshStatus sees a status error. - std::function device_error_callback_ GUARDED_BY(mu_); + std::function device_error_callback_ TF_GUARDED_BY(mu_); // Set of devices to use. This controls which of the devices on the given // platform will have resources allocated. For GPUs this will be diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 6871f7ec614..e8df09c7b4d 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/dma_helper.h" +#include "tensorflow/core/framework/tensor_reference.h" #include "tensorflow/core/platform/mem.h" #include "tensorflow/stream_executor/platform/port.h" diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index b90a3ad2e16..05d8dfa7556 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -117,7 +117,7 @@ class XlaDeviceContext : public DeviceContext { bool use_fast_mem_; absl::Mutex mu_; - int next_stream_ GUARDED_BY(mu_) = 0; + int next_stream_ TF_GUARDED_BY(mu_) = 0; }; } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 81d63d299ee..511e0f1451a 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -18,7 +18,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_ -#include "absl/base/thread_annotations.h" #include "tensorflow/compiler/jit/xla_compilation_cache.h" #include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -30,6 +29,7 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/stream_executor/device_memory_allocator.h" namespace tensorflow { @@ -102,7 +102,7 @@ class VariableInfo { // `variables` is allowed to contain instances that don't track a resource // variable (i.e. variables[i].var() can be null for some i). Status LockVariables(absl::Span variables) - EXCLUSIVE_LOCK_FUNCTION(); + TF_EXCLUSIVE_LOCK_FUNCTION(); // Helper class to perform the marshalling of TensorFlow inputs and outputs to // ShapedBuffers suitable for passing to an XLA computation. 
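The changes in these JIT headers (and in xla_tensor.h below) consistently migrate the bare Clang thread-safety annotations (GUARDED_BY, LOCKS_EXCLUDED, EXCLUSIVE_LOCKS_REQUIRED, EXCLUSIVE_LOCK_FUNCTION) to the TF_-prefixed macros from tensorflow/core/platform/thread_annotations.h. The short sketch below is illustrative only and not part of this change; the SharedCounter class is hypothetical, but it shows how the three most common macros are typically combined with tensorflow::mutex.

#include <cstdint>

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

// Hypothetical example class; not part of this change.
class SharedCounter {
 public:
  // Callers must not already hold mu_; the lock is acquired here.
  void Increment() TF_LOCKS_EXCLUDED(mu_) {
    tensorflow::mutex_lock lock(mu_);
    IncrementLocked();
  }

 private:
  // Callers must hold mu_ before calling.
  void IncrementLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { ++value_; }

  tensorflow::mutex mu_;
  int64_t value_ TF_GUARDED_BY(mu_) = 0;  // Reads and writes require mu_.
};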
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 8a4eb7493be..7f7d97e3b3f 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -122,7 +122,7 @@ class XlaTensor { std::shared_ptr definition_event_; // A list of all streams for which the tensor's content is defined for any // newly enqueued command. - absl::InlinedVector streams_defined_on_ GUARDED_BY(mu_); + absl::InlinedVector streams_defined_on_ TF_GUARDED_BY(mu_); mutex mu_; }; diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 90c60a85ba2..2ed1c274f75 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -74,12 +74,15 @@ cc_library( "//tensorflow/compiler/mlir/xla:hlo", "//tensorflow/compiler/mlir/xla:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/mlir/xla:lhlo_copy_removal", "//tensorflow/compiler/mlir/xla:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_gpu", + "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_parallel_loops", "//tensorflow/compiler/mlir/xla:xla_dialect_registration", "//tensorflow/compiler/mlir/xla:xla_legalize_control_flow", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/xla:xla_legalize_to_linalg", "//tensorflow/compiler/mlir/xla:xla_legalize_to_standard", "//tensorflow/compiler/mlir/xla:xla_lower", @@ -100,6 +103,38 @@ cc_library( ], ) +cc_library( + name = "mlir_graph_optimization_pass", + srcs = ["mlir_graph_optimization_pass.cc"], + hdrs = ["mlir_graph_optimization_pass.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:device_util", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/core:core_cpu", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + ], + alwayslink = 1, +) + +cc_library( + name = "mlir_graph_optimization_pass_registration", + srcs = [ + "mlir_graph_optimization_pass_registration.cc", + ], + deps = [ + ":mlir_graph_optimization_pass", + "//tensorflow/core:core_cpu", + ], + alwayslink = 1, +) + tf_cc_binary( name = "tf-opt", deps = [ diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 5e6348e3ac0..c917af71f92 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -30,7 +30,8 @@ filegroup( "ir/tfl_ops.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Transforms/LoopLikeInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", ], ) @@ -221,19 +222,21 @@ cc_library( deps = [ ":tensorflow_lite_ops_inc_gen", ":validators", - "@llvm-project//llvm:support", - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - # TODO(jpienaar): Move this out after splitting out 
LoopLikeOpInterface. - "@llvm-project//mlir:Transforms", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/lite/schema:schema_fbs", + "@llvm-project//llvm:support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LoopLikeInterface", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", ], alwayslink = 1, ) @@ -325,8 +328,8 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:mangling_util", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:unroll_batch_matmul_pass", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:framework", @@ -436,7 +439,7 @@ genrule( srcs = [ "ir/tfl_ops.td", "ir/tfl_op_interfaces.td", - "@llvm-project//mlir:include/mlir/Transforms/LoopLikeInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", ], outs = [ @@ -516,6 +519,7 @@ cc_library( "@com_google_absl//absl/strings", "@flatbuffers", "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:TransformUtils", ], @@ -580,7 +584,7 @@ cc_library( "//tensorflow/lite/delegates/flex:whitelisted_flex_ops_lib", "//tensorflow/lite/kernels/internal:kernel_utils", "//tensorflow/lite/schema:schema_fbs", - "//tensorflow/lite/tools/versioning:op_version", + "//tensorflow/lite/tools/versioning", "@com_google_absl//absl/base", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", @@ -697,11 +701,12 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", - "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -725,6 +730,7 @@ cc_library( ":tensorflow_lite_quantize", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", @@ -734,11 +740,10 @@ cc_library( "//tensorflow/lite/tools/optimize:quantize_weights", "//tensorflow/stream_executor/lib", "@llvm-project//llvm:support", - "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", 
"@llvm-project//mlir:Pass", - "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], diff --git a/tensorflow/compiler/mlir/lite/README.md b/tensorflow/compiler/mlir/lite/README.md index 224727621d6..b9c58b28a79 100644 --- a/tensorflow/compiler/mlir/lite/README.md +++ b/tensorflow/compiler/mlir/lite/README.md @@ -1,9 +1,9 @@ -# Experimental code for the new TF-Lite convertor, and MLIR dialects and utilities for TensorFlow Lite. +# The new [MLIR](https://github.com/llvm/llvm-project/tree/master/mlir) based +TensorFlow to TensorFlow Lite converter This directory contains: -1. Experimental code for the new TF-Lite convertor. -2. Code for the TF-lite dialect [MLIR](https://github.com/tensorflow/mlir). +1. MLIR dialects, transformation passes and utilities for TensorFlow Lite. ## API: @@ -11,7 +11,8 @@ The API for converting TensorFlow models to TensorFlow Lite will be through `tf.lite.TFLiteConverter`. All the conversion code is open sourced, and the API will be integrated soon. -### The conversion process from TensorFlow to TensorFlow Lite includes the following major passes: +### The conversion process from TensorFlow to TensorFlow Lite includes the +following major passes: - Import from GraphDef, in .pb or .pbtxt format, into MLIR. - Raise to Control-flow-graph. Converts TF Control Flow dialect to TF dialect. @@ -28,3 +29,6 @@ TensorFlow Lite models). - The Export pass writes out TensorFlow Lite FlatBuffer format. This pass operates on MLIR TensorFlow Lite dialect and is simple/direct translation. +See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +for the full list of MLIR passes for conversion from TensorFlow to +TensorFlow Lite. diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h index b14041e8067..322aeadfa37 100644 --- a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h +++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -34,9 +34,9 @@ struct PassConfig { quant_specs(std::move(specs)), skip_control_dialect(false), form_clusters(false), - inline_functions(true), unfold_batch_matmul(true), - legalize_tf_while(true) {} + legalize_tf_while(true), + shape_inference(false) {} // If `emit_builtin_tflite_ops` is true, TF Lite legalization passes will be // added, which produces TF Lite ops. @@ -56,9 +56,6 @@ struct PassConfig { // are formed by grouping consecutive ops of the same device, under a // `tf_device.launch` op. bool form_clusters; - // Inline function calls within the main function in the MLIR module, prior - // to legalization to TFLite. - bool inline_functions; // if `unfold_batch_matmul` is true, the tf.BatchMatMul is unfolded to a set // of tfl.fully_connected ops. bool unfold_batch_matmul; @@ -66,6 +63,8 @@ struct PassConfig { // Note: This is staging step and will be removed. // TODO(b/137395003): Remove post switching legalization. bool legalize_tf_while; + // Whether to do shape inference. + bool shape_inference; }; } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc index 02d9ef45591..8ecff8757b7 100644 --- a/tensorflow/compiler/mlir/lite/converter_gen.cc +++ b/tensorflow/compiler/mlir/lite/converter_gen.cc @@ -119,6 +119,12 @@ static void EmitOptionBuilders(const RecordKeeper &record_keeper, // conversion generation and so the simplicity was chosen over the // flexibility. 
StringRef arg_name = arg_values->getArgNameStr(i); + // Skip any "intermiadiateXXX" attribute as they are specially handled + // in the exporter. They are special because though they are attributes + // in the MLIR they are expressed as tensors in the flatbuffer instead + // of option. + if (op_name == "LSTMOp" && arg_name.take_back(12) == "intermediate") + continue; os << formatv( " auto {0} = Convert{1}ForOptionWriter(op.{0}(), fbb);\n", arg_name, mlir::tblgen::Attribute(arg_def).getAttrDefName()); @@ -164,17 +170,24 @@ static void EmitOperatorBuilders(const std::vector &defs, for (const auto *def : defs) { StringRef op_name = def->getName().drop_front(4); + const bool has_intermediates = op_name == "LSTMOp"; // Signature os << "static flatbuffers::Offset " << GetOperatorBuilderName(def->getName()) << "(mlir::TFL::" << op_name << " tflOp, uint32_t opcode_index, " << "const std::vector& operands," << "const std::vector& results," + << (has_intermediates ? "const std::vector& intermediate_index," + : "") << "flatbuffers::FlatBufferBuilder *fbb) {\n"; // Inputs & outputs os << " auto inputs = fbb->CreateVector(operands);\n" " auto outputs = fbb->CreateVector(results);\n\n"; + // Intermediates for LSTM. + if (has_intermediates) { + os << " auto intermediates = fbb->CreateVector(intermediate_index);\n"; + } // Build the FlatBuffer operator os << " return tflite::CreateOperator(\n" @@ -191,9 +204,9 @@ static void EmitOperatorBuilders(const std::vector &defs, // Only builtin ops' builders are auto-generated. custom_options are only // used by custom or flex ops and those ops are handled manually. os << " /*custom_options=*/0, " - "tflite::CustomOptionsFormat_FLEXBUFFERS,\n" - " /*mutating_variable_inputs=*/0);\n" - "}\n\n"; + << "tflite::CustomOptionsFormat_FLEXBUFFERS,\n" + << " /*mutating_variable_inputs=*/0" + << (has_intermediates ? ", intermediates" : "") << ");\n}\n\n"; } } @@ -244,6 +257,7 @@ static void EmitGetBuiltinOpCode(const std::vector &defs, // uint32_t opcode_index, // const std::vector& operands, // const std::vector& results, +// const std::vector& intermediates, // flatbuffers::FlatBufferBuilder *fbb); static void EmitBuildOperator(const std::vector &defs, raw_ostream *ostream) { @@ -255,6 +269,7 @@ static void EmitBuildOperator(const std::vector &defs, "uint32_t opcode_index, " "const std::vector& operands," "const std::vector& results," + "const std::vector& intermediates," "flatbuffers::FlatBufferBuilder *fbb) {\n"; for (const auto *def : defs) { @@ -264,7 +279,8 @@ static void EmitBuildOperator(const std::vector &defs, os << " if (auto tflOp = llvm::dyn_cast(op))\n" << " return " << GetOperatorBuilderName(def->getName()) - << "(tflOp, opcode_index, operands, results, fbb);\n"; + << "(tflOp, opcode_index, operands, results, " + << (op_name == "LSTMOp" ? "intermediates, " : "") << "fbb);\n"; } os << " return llvm::None;\n" @@ -307,6 +323,10 @@ static void EmitBuiltinOptionsToAttributes(const RecordKeeper &record_keeper, if (!arg_def) continue; if (arg_def->getDef()->isSubClassOf(attr_type)) { StringRef arg_name = arg_values->getArgNameStr(i); + // Already handle this case in flatbuffer_import.cc. 
+ if (option_name == "LSTMOptions" && + arg_name.take_back(12) == "intermediate") + continue; StringRef attr_type = mlir::tblgen::Attribute(arg_def).getAttrDefName(); os << formatv( " attributes.emplace_back(builder.getNamedAttr(\"{0}\"," diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 6753ab9e728..29233f86e4a 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -547,6 +547,7 @@ bool IsCustomOp(const std::string& op_name) { // TODO(krzysd) Handle function calls StatusOr ConvertOp( const tflite::OperatorT& op, const std::vector& vals_map, + const std::vector& intermediate_types, Value optional_arg_marker, const std::vector& op_names, const std::vector& func_names, const std::vector>& tensors, Location loc, @@ -608,6 +609,28 @@ StatusOr ConvertOp( if (op_name == "tfl.lstm") { // TODO(b/147587779): add the right region if region is empty. op_state.addRegion(); + if (!op.intermediates.empty()) { + if (op.intermediates.size() != 5) { + auto err = errors::InvalidArgument( + "operator has intermediate tensors but the number of them is not " + "five."); + return emitError(loc, err.ToString()), err; + } + // Create intermediate value + + const llvm::SmallVector kIntermediateNames = { + "input_to_input_intermediate", "input_to_forget_intermediate", + "input_to_cell_intermediate", "input_to_output_intermediate", + "effective_hidden_scale_intermediate"}; + for (auto type_and_name : + llvm::zip(intermediate_types, kIntermediateNames)) { + mlir::TypeAttr type_attr = + mlir::TypeAttr::get(std::get<0>(type_and_name)); + auto named_attr = + builder.getNamedAttr(std::get<1>(type_and_name), type_attr); + op_state.addAttribute(named_attr.first, named_attr.second); + } + } } llvm::SmallVector attrs; @@ -893,6 +916,18 @@ StatusOr ConvertSubgraph( } } + // Intermediate tensors for tfl.lstm are used to carry quantization range + // in their types, so we only need and extract their types. + std::vector intermediate_types; + intermediate_types.reserve(5); + for (auto intermediate : op->intermediates) { + TF_ASSIGN_OR_RETURN( + auto type, GetTensorType(*subgraph.tensors[intermediate], builder, + /*shapeless_are_scalars=*/true, + /*is_constant=*/true)); + intermediate_types.emplace_back(type); + } + // The NameLoc corresponding to the name of the first output tensor auto op_loc = op->outputs.empty() @@ -902,8 +937,8 @@ StatusOr ConvertSubgraph( // to a valid Value TF_ASSIGN_OR_RETURN( auto* mlir_op, - ConvertOp(*op, vals_map, maybe_optional_arg_marker, op_names, - func_names, subgraph.tensors, op_loc, op_builder)); + ConvertOp(*op, vals_map, intermediate_types, maybe_optional_arg_marker, + op_names, func_names, subgraph.tensors, op_loc, op_builder)); // Add the results to the value maps. There are two cases: 1. 
the result // tensor does not have min/max values, the original op result is used diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h index fdc0fd81f8f..4e8e3f6424e 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.h @@ -44,6 +44,7 @@ llvm::Optional GetBuiltinOpCode(Operation *mlir_op); llvm::Optional> CreateFlatBufferOperator( Operation *mlir_op, uint32_t opcode_index, const std::vector &operands, const std::vector &results, + const std::vector &intermediates, flatbuffers::FlatBufferBuilder *fbb); // Populates the array of mlir::NamedAttributes corresponding to the given diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index ac20ab68eaa..9e9330e2c96 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -43,6 +43,7 @@ limitations under the License. #include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/QuantOps/QuantTypes.h" // TF:llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project @@ -75,6 +76,7 @@ limitations under the License. #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/tools/versioning/op_version.h" +#include "tensorflow/lite/tools/versioning/runtime_version.h" #include "tensorflow/lite/version.h" using llvm::dyn_cast; @@ -179,8 +181,6 @@ static StatusOr GetTFLiteType(Type type, return tflite::TensorType_FLOAT16; case mlir::TF::TensorFlowTypes::STRING: return tflite::TensorType_STRING; - case mlir::TF::TensorFlowTypes::UINT8: - return tflite::TensorType_UINT8; case mlir::TF::TensorFlowTypes::QUINT8: return tflite::TensorType_UINT8; case mlir::StandardTypes::Complex: { @@ -196,7 +196,8 @@ static StatusOr GetTFLiteType(Type type, case 1: return tflite::TensorType_BOOL; case 8: - return tflite::TensorType_INT8; + return itype.isUnsigned() ? tflite::TensorType_UINT8 + : tflite::TensorType_INT8; case 16: return tflite::TensorType_INT16; case 32: @@ -404,6 +405,11 @@ class Translator { // and returns llvm::None on failure. Optional> BuildBuffer(Operation* inst); + // Build TFLite tensor from the given type. This function is for tfl.lstm + // intermediates, which should have UniformQuantizedType. + Optional> BuildTensorFromType( + mlir::Type type, const std::string& name); + // Builds TFLite tensor from the given value. `buffer_idx` is index of the // corresponding buffer. Emits error and returns llvm::None on failure. Optional> BuildTensor(Value value, @@ -469,7 +475,8 @@ class Translator { // tensor indices. Emits an error and returns llvm::None on failure. Optional> BuildOperator( Operation* inst, const std::vector& operands, - const std::vector& results); + const std::vector& results, + const std::vector& intermediates); // Build a subgraph with a given name out of the region either corresponding // to a function's body or while op. 
@@ -581,6 +588,34 @@ Optional> Translator::BuildBuffer( return tflite::CreateBuffer(builder_, buffer_data); } +Optional> Translator::BuildTensorFromType( + mlir::Type type, const std::string& name) { + auto tensor_type = type.cast(); + + if (!tensor_type.hasStaticShape()) { + return llvm::None; + } + llvm::ArrayRef shape_ref = tensor_type.getShape(); + std::vector shape(shape_ref.begin(), shape_ref.end()); + + auto element_type = tensor_type.getElementType(); + tflite::TensorType tflite_element_type = + GetTFLiteType(tensor_type.getElementType()).ValueOrDie(); + BufferOffset q_params; + auto qtype = element_type.dyn_cast(); + if (!qtype) { + return llvm::None; + } + q_params = tflite::CreateQuantizationParameters( + builder_, /*min=*/0, /*max=*/0, + builder_.CreateVector({static_cast(qtype.getScale())}), + builder_.CreateVector({qtype.getZeroPoint()})); + return tflite::CreateTensor( + builder_, builder_.CreateVector(shape), tflite_element_type, + /*buffer=*/0, builder_.CreateString(name), q_params, + /*is_variable=*/false); +} + Optional> Translator::BuildTensor( Value value, const std::string& name, unsigned buffer_idx) { auto type = value.getType().cast(); @@ -933,7 +968,8 @@ uint32_t Translator::GetOpcodeIndex(const std::string& op_name, Optional> Translator::BuildOperator( Operation* inst, const std::vector& operands, - const std::vector& results) { + const std::vector& results, + const std::vector& intermediates) { const auto* dialect = inst->getDialect(); if (!dialect) { inst->emitOpError("dialect is not registered"); @@ -986,7 +1022,7 @@ Optional> Translator::BuildOperator( std::string op_name = inst->getName().getStringRef().str(); uint32_t opcode_index = GetOpcodeIndex(op_name, *builtin_code); auto offset = CreateFlatBufferOperator(inst, opcode_index, operands, - results, &builder_); + results, intermediates, &builder_); if (!offset) { inst->emitOpError("is not a supported TFLite op"); } @@ -1171,6 +1207,29 @@ Optional> Translator::BuildSubGraph( bool failed_once = false; for (auto& inst : bb) { if (inst.isKnownTerminator()) break; + std::vector intermediates; + // Build intermediate tensors for tfl.lstm and insert these tensors into + // flatbuffer. 
+ if (llvm::isa(inst)) { + std::vector intermediate_names = { + "input_to_input_intermediate", "input_to_forget_intermediate", + "input_to_cell_intermediate", "input_to_output_intermediate", + "effective_hidden_scale_intermediate"}; + for (const std::string& intermediate : intermediate_names) { + auto intermediate_attr = inst.getAttr(intermediate); + if (auto attr = intermediate_attr.dyn_cast_or_null()) { + Type qtype = attr.getValue(); + auto tensor_or = BuildTensorFromType( + qtype, name_mapper_.GetUniqueName(intermediate).str()); + if (!tensor_or.hasValue()) { + continue; + } else { + intermediates.push_back(tensors.size()); + tensors.push_back(tensor_or.getValue()); + } + } + } + } for (auto val : inst.getResults()) { std::string name = UniqueName(val); @@ -1195,7 +1254,8 @@ Optional> Translator::BuildSubGraph( results.push_back(tensor_index_map.lookup(result)); } - if (auto tfl_operator = BuildOperator(&inst, operands, results)) + if (auto tfl_operator = + BuildOperator(&inst, operands, results, intermediates)) operators.push_back(*tfl_operator); else failed_once = true; @@ -1230,27 +1290,58 @@ BufferOffset Translator::BuildMetadata(StringRef name, Optional>> Translator::CreateMetadataVector() { auto dict_attr = module_.getAttrOfType("tfl.metadata"); - if (!dict_attr) return VectorBufferOffset>(); - std::vector> metadata; - for (const auto& named_attr : dict_attr) { - StringRef name = named_attr.first; - mlir::Attribute attr = named_attr.second; - if (auto content = attr.dyn_cast()) { - metadata.push_back(BuildMetadata(name, content.getValue())); - } else { - module_.emitError( - "all values in tfl.metadata's dictionary key-value pairs should be " - "string attributes"); - return llvm::None; + if (dict_attr) { + for (const auto& named_attr : dict_attr) { + StringRef name = named_attr.first; + mlir::Attribute attr = named_attr.second; + if (auto content = attr.dyn_cast()) { + metadata.push_back(BuildMetadata(name, content.getValue())); + } else { + module_.emitError( + "all values in tfl.metadata's dictionary key-value pairs should be " + "string attributes"); + return llvm::None; + } } } + // Runtime version string is generated after we update the op + // versions. Here we put a 16-byte dummy string as a placeholder. We choose + // 16-byte because it's the alignment of buffers in flatbuffer, so it won't + // cause any waste of space if the actual string is shorter than 16 bytes. + metadata.push_back( + BuildMetadata("min_runtime_version", std::string(16, '\0'))); return builder_.CreateVector(metadata); } +bool UpdateEntryFunction(ModuleOp module) { + if (module.lookupSymbol("main") != nullptr) { + // We already have an entry function. + return true; + } + + int entry_func_count = 0; + FuncOp entry_func = nullptr; + for (auto fn : module.getOps()) { + auto attrs = fn.getAttrOfType("tf.entry_function"); + if (attrs && !attrs.empty()) { + entry_func_count++; + entry_func = fn; + } + } + + // We should have one & only have one entry function. + if (entry_func_count != 1) return false; + + // Update the entry func to main. 
+ entry_func.setName("main"); + return true; +} + Optional Translator::Translate( ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, OpOrArgNameMapper* op_or_arg_name_mapper) { + if (!UpdateEntryFunction(module)) return llvm::None; if (!IsValidTFLiteMlirModule(module)) return llvm::None; Translator translator(module, emit_builtin_tflite_ops, emit_select_tf_ops, emit_custom_ops, op_or_arg_name_mapper); @@ -1334,6 +1425,7 @@ Optional Translator::TranslateInternal() { builder_.CreateVector(buffers_), metadata_buffer, *metadata); tflite::FinishModelBuffer(builder_, model); tflite::UpdateOpVersion(builder_.GetBufferPointer()); + tflite::UpdateMinimumRuntimeVersionForModel(builder_.GetBufferPointer()); // Return serialized string for the built FlatBuffer. return std::string(reinterpret_cast(builder_.GetBufferPointer()), diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 83e372e5732..5f8e9c35b94 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -36,6 +36,7 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // TF:llvm-project #include "mlir/Support/LLVM.h" // TF:llvm-project #include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "mlir/Transforms/FoldUtils.h" // TF:llvm-project #include "mlir/Transforms/InliningUtils.h" // TF:llvm-project #include "mlir/Transforms/RegionUtils.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -66,13 +67,29 @@ struct TensorFlowLiteInlinerInterface : public DialectInlinerInterface { } }; +struct TensorFlowLiteOpFolderDialectInterface + : public OpFolderDialectInterface { + using OpFolderDialectInterface::OpFolderDialectInterface; + + // Registered hook to check if the given region, which is attached to an + // operation that is *not* isolated from above (i.e. no internal regions + // reference values defined in an enclosing region), should be used when + // materializing constants. + // In the TFLite dialect we materialize inside while regions as it is + // slightly more efficient computationally.
+ bool shouldMaterializeInto(Region *region) const final { + return isa(region->getParentOp()); + } +}; + TensorFlowLiteDialect::TensorFlowLiteDialect(mlir::MLIRContext *context) : Dialect(/*name=*/"tfl", context) { addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" >(); - addInterfaces(); + addInterfaces(); } //===----------------------------------------------------------------------===// @@ -1269,6 +1286,20 @@ static LogicalResult Verify(UnidirectionalSequenceLSTMOp op) { "UnidirectionalSequenceLSTMOp expected to have two stateful operands"); } +//===----------------------------------------------------------------------===// +// BidirectionalSequenceLSTMOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(BidirectionalSequenceLSTMOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 4 && operands[0] == 35 && operands[1] == 36 && + operands[2] == 37 && operands[3] == 38) { + return success(); + } + return op.emitError( + "BidirectionalSequenceLSTMOp expected to have four stateful operands"); +} + //===----------------------------------------------------------------------===// // UnidirectionalSequenceRNNOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 1e74a8c1a9e..cfe18a218bc 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -25,9 +25,11 @@ limitations under the License. #include "mlir/IR/Dialect.h" // TF:llvm-project #include "mlir/IR/OpImplementation.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // TF:llvm-project +#include "mlir/Interfaces/LoopLikeInterface.h" // TF:llvm-project +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project #include "mlir/Support/Functional.h" // TF:llvm-project #include "mlir/Support/LLVM.h" // TF:llvm-project -#include "mlir/Transforms/LoopLikeInterface.h" // TF:llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -39,6 +41,8 @@ class TensorFlowLiteDialect : public Dialect { public: explicit TensorFlowLiteDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "tfl"; } + // Registered hook to materialize a constant operation from a given attribute // value with the desired resultant type. Operation *materializeConstant(OpBuilder &builder, Attribute value, Type type, diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 6c5981359b3..5624c7e2b73 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -19,7 +19,8 @@ limitations under the License. 
#define TFL_OPS include "mlir/IR/OpBase.td" -include "mlir/Transforms/LoopLikeInterface.td" +include "mlir/Interfaces/LoopLikeInterface.td" +include "mlir/Interfaces/SideEffects.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" include "tensorflow/compiler/mlir/lite/quantization/quantization.td" @@ -47,13 +48,6 @@ def TFL_Str : Type()">, "TFLite string type">, BuildableType<"getType()">; -//===----------------------------------------------------------------------===// -// TFLite dialect uint8 type - uses the TF uint8 type as implementation -//===----------------------------------------------------------------------===// -def TFL_Uint8 : Type()">, - "TFLite uint8 type">, - BuildableType<"getType()">; - //===----------------------------------------------------------------------===// // TFLite dialect quint8 type - uses the TF quint8 type as implementation //===----------------------------------------------------------------------===// @@ -141,7 +135,8 @@ class TFL_VariadicTensorOf allowedRuntimeTypes, Variadic>, TFL_RuntimeType>>; -def TFL_Int32Or64 : IntOfWidths<[32, 64]>; +def TFL_Uint8 : UI<8>; +def TFL_Int32Or64 : SignlessIntOfWidths<[32, 64]>; def TFL_BoolTensor : TFL_TensorOf<[I1]>; def TFL_FpOrI32OrI64Tensor : TFL_TensorOf<[AnyFloat, TFL_Int32Or64]>; @@ -223,9 +218,9 @@ class TFL_Operand0DOr1ElementTensor : class TFL_TFTypesWithSameBits : And<[ Or<[CPred<"getElementTypeOrSelf($_op.getResult(" # i # ")).isa()">, - CPred<"getElementTypeOrSelf($_op.getResult(" # i # ")).isa()">]>, + CPred<"getElementTypeOrSelf($_op.getResult(" # i # ")).isUnsignedInteger(" # num # ")">]>, Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, - CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">]>]>; + CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; class TFL_OperandHasRankLessThan : PredOpTrait<"operand " # n # " is maximum " # m # "-D", @@ -602,7 +597,7 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let verifier = [{ return Verify(*this); }]; } -def TFL_ConstOp : Op { let summary = "Constant pseudo op."; @@ -1863,11 +1858,11 @@ def TFL_MulOp : TFL_Op<"mul", [ResultsBroadcastableShape, NoSideEffect, Commutat }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs, + ins TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$lhs, + TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$rhs, TFL_AFAttr:$fused_activation_function); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$output); let hasFolder = 1; @@ -1887,9 +1882,9 @@ def TFL_NegOp: TFL_Op<"neg", [NoSideEffect, SameOperandsAndResultType]> { Computes element-wise negation of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_TensorOf<[F32, I32, I64]>:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_TensorOf<[F32, I32, I64]>:$y); let hasOptions = 0b1; @@ -2039,10 +2034,10 @@ def TFL_PowOp : TFL_Op<"pow", [ResultsBroadcastableShape, NoSideEffect, NoQuanti }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32]>:$lhs, + TFL_TensorOf<[F32, I32]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32]>:$output); let parser = [{ return mlir::impl::parseOneResultSameOperandTypeOp(parser, result); }]; @@ -2716,7 +2711,7 @@ def TFL_SplitOp : TFL_Op<"split", [ let arguments = (ins TFL_TensorOf<[I32]>:$split_dim, TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, - PositiveI32Attr:$num_splits + 
Confined:$num_splits ); let results = (outs @@ -2741,7 +2736,7 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect, SameOperandsAndResultsScale] TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, TFL_1DTensorOf<[I32], [I32]>:$size_splits, TFL_0DTensorOf<[I32], [I32]>:$split_dim, - PositiveI32Attr:$num_splits + Confined:$num_splits ); let results = (outs @@ -3246,7 +3241,15 @@ Ba et al. 'Layer Normalization' // Since this op is the FULL kernel only, constrain it. Confined< DefaultValuedAttr, - [TFL_LSTM_KT_FULL]>:$kernel_type + [TFL_LSTM_KT_FULL]>:$kernel_type, + + // Types of the optional intermediate tensors, which exist for fully + // quantized LSTM op and hold the ranges of the intermediate tensors. + OptionalAttr:$input_to_input_intermediate, + OptionalAttr:$input_to_forget_intermediate, + OptionalAttr:$input_to_cell_intermediate, + OptionalAttr:$input_to_output_intermediate, + OptionalAttr:$effective_hidden_scale_intermediate ); let results = (outs AnyTensor:$output); @@ -3350,6 +3353,156 @@ def TFL_UnidirectionalSequenceLSTMOp : }]; } +def BidiLstmMandatoryInputsConstraint : PredOpTrait< + "mandatory operands element types should match", + // TODO(ashwinm): Replace the indices with input tensor names when that + // support is available. + Or<[ + TCopVTEtAreSameAt<[0, 2, 3, 4, 6, 7, 8, 13, 14, 15, 19, 20, 21, 23, 24, 25, + 30, 31, 32, 35, 36, 37, 38]>, + Neg>]>>; + +def BidiLstmOptionalPeepholeWeightConstraint : PredOpTrait< + "the optional peephole weights should all be specified or none", + TCopVTEtAreSameAt<[9, 10, 11, 26, 27, 28]>>; + +def BidiLstmProjectionWeightBiasConstraint : PredOpTrait< + "either projection weight must be specified or both projection weight and " + "projection bias must not be specified", + Or<[ + And<[TypeIsPred<"fw_projection_weights", NoneType>, + TypeIsPred<"fw_projection_bias", NoneType>, + TypeIsPred<"bw_projection_weights", NoneType>, + TypeIsPred<"bw_projection_bias", NoneType>]>, + And<[ + Neg>, + Neg>, + ]> + ]>>; + +// BidirectionalSequenceLstm op. +// TODO(ashwinm): Add constraint to validate the combination of operands +// that are valid for hybrid vs fully quantized vs float only semantics +def TFL_BidirectionalSequenceLSTMOp : + TFL_Op<"bidirectional_sequence_lstm", + [BidiLstmMandatoryInputsConstraint, + BidiLstmOptionalPeepholeWeightConstraint, + BidiLstmProjectionWeightBiasConstraint, + LstmResultConstraint, + TFL_StatefulOp]> { + let summary = "Bidirectional sequence lstm operator"; + + let description = [{ + Bidirectional lstm is essentially two lstms, one running forward and the + other running backward. The output is the concatenation of the two + lstms.
+ }]; + + let arguments = ( + ins TFL_TensorOf<[F32, I8]>:$input, + + // Forward LSTM Weights + TFL_TensorOfOrNone<[F32, I8]>:$fw_input_to_input_weights, + TFL_TensorOf<[F32, I8]>:$fw_input_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$fw_input_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$fw_input_to_output_weights, + + // Forward Recurrent weights + TFL_TensorOfOrNone<[F32, I8]>:$fw_recurrent_to_input_weights, + TFL_TensorOf<[F32, I8]>:$fw_recurrent_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$fw_recurrent_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$fw_recurrent_to_output_weights, + + // Forward Cell weights + TFL_TensorOfOrNone<[F32, I8]>:$fw_cell_to_input_weights, + // Optional Forward cell weights + TFL_TensorOfOrNone<[F32, I8]>:$fw_cell_to_forget_weights, + // Optional Forward cell weights + TFL_TensorOfOrNone<[F32, I8]>:$fw_cell_to_output_weights, + + // Forward Bias + TFL_TensorOfOrNone<[F32]>:$fw_input_gate_bias, + TFL_TensorOf<[F32]>:$fw_forget_gate_bias, + TFL_TensorOf<[F32]>:$fw_cell_bias, + TFL_TensorOf<[F32]>:$fw_output_gate_bias, + + // Forward Projection weight and bias + TFL_TensorOfOrNone<[F32, I8]>:$fw_projection_weights, + // Forward Optional input + TFL_TensorOfOrNone<[F32]>:$fw_projection_bias, + + // Backward LSTM Weights + TFL_TensorOfOrNone<[F32, I8]>:$bw_input_to_input_weights, + TFL_TensorOf<[F32, I8]>:$bw_input_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$bw_input_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$bw_input_to_output_weights, + + // Backward Recurrent weights + TFL_TensorOfOrNone<[F32, I8]>:$bw_recurrent_to_input_weights, + TFL_TensorOf<[F32, I8]>:$bw_recurrent_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$bw_recurrent_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$bw_recurrent_to_output_weights, + + // Backward Cell weights + TFL_TensorOfOrNone<[F32, I8]>:$bw_cell_to_input_weights, + // Optional Forward cell weights + TFL_TensorOfOrNone<[F32, I8]>:$bw_cell_to_forget_weights, + // Optional Forward cell weights + TFL_TensorOfOrNone<[F32, I8]>:$bw_cell_to_output_weights, + + // Backward Bias + TFL_TensorOfOrNone<[F32]>:$bw_input_gate_bias, + TFL_TensorOf<[F32]>:$bw_forget_gate_bias, + TFL_TensorOf<[F32]>:$bw_cell_bias, + TFL_TensorOf<[F32]>:$bw_output_gate_bias, + + // Backward Projection weight and bias + TFL_TensorOfOrNone<[F32, I8]>:$bw_projection_weights, + // Backward Optional input + TFL_TensorOfOrNone<[F32]>:$bw_projection_bias, + + // Stateful activation and cell states. + TFL_StatefulTensor:$fw_input_activation_state, + TFL_StatefulTensor:$fw_input_cell_state, + TFL_StatefulTensor:$bw_input_activation_state, + TFL_StatefulTensor:$bw_input_cell_state, + + // Auxiliary input & weights. + TFL_TensorOfOrNone<[F32, I8]>:$aux_input, + // Auxiliary fw weights. + TFL_TensorOfOrNone<[F32, I8]>:$fw_aux_input_to_input_weights, + TFL_TensorOfOrNone<[F32, I8]>:$fw_aux_input_to_forget_weights, + TFL_TensorOfOrNone<[F32, I8]>:$fw_aux_input_to_cell_weights, + TFL_TensorOfOrNone<[F32, I8]>:$fw_aux_input_to_output_weights, + // Auxiliary bw weights. 
+ TFL_TensorOfOrNone<[F32, I8]>:$bw_aux_input_to_input_weights, + TFL_TensorOfOrNone<[F32, I8]>:$bw_aux_input_to_forget_weights, + TFL_TensorOfOrNone<[F32, I8]>:$bw_aux_input_to_cell_weights, + TFL_TensorOfOrNone<[F32, I8]>:$bw_aux_input_to_output_weights, + + // Attributes + TFL_AFAttr:$fused_activation_function, + DefaultValuedAttr:$cell_clip, + DefaultValuedAttr:$proj_clip, + BoolAttr:$merge_outputs, + BoolAttr:$time_major + ); + + let results = (outs + AnyTensor:$fw_output, + AnyTensor:$bw_output + ); + + let hasOptions = 1; + + let verifier = [{ return Verify(*this); }]; + + let extraClassDeclaration = [{ + // StatefulOpInterface: + std::vector GetStatefulOperands() { return {35, 36, 37, 38}; } + }]; +} + def RnnResultConstraint : PredOpTrait< "the input and result tensor elemental types must be same", TCresVTEtIsSameAsOp<0, 0>>; diff --git a/tensorflow/compiler/mlir/lite/python/BUILD b/tensorflow/compiler/mlir/lite/python/BUILD index 3080d74ee9c..638884634d5 100644 --- a/tensorflow/compiler/mlir/lite/python/BUILD +++ b/tensorflow/compiler/mlir/lite/python/BUILD @@ -10,11 +10,9 @@ package_group( ) cc_library( - name = "graphdef_to_tfl_flatbuffer", - srcs = ["graphdef_to_tfl_flatbuffer.cc"], - hdrs = [ - "graphdef_to_tfl_flatbuffer.h", - ], + name = "tf_tfl_flatbuffer_helpers", + srcs = ["tf_tfl_flatbuffer_helpers.cc"], + hdrs = ["tf_tfl_flatbuffer_helpers.h"], deps = [ "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:tensorflow_lite", @@ -36,3 +34,61 @@ cc_library( "@llvm-project//mlir:Transforms", ], ) + +cc_library( + name = "graphdef_to_tfl_flatbuffer", + srcs = ["graphdef_to_tfl_flatbuffer.cc"], + hdrs = [ + "graphdef_to_tfl_flatbuffer.h", + ], + deps = [ + ":tf_tfl_flatbuffer_helpers", + "//tensorflow/compiler/mlir/lite:common", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite:tf_tfl_passes", + "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/lite/toco:model_flags_proto_cc", + "//tensorflow/lite/toco:toco_flags_proto_cc", + "//tensorflow/lite/toco:types_proto_cc", + "//tensorflow/stream_executor/lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "saved_model_to_tfl_flatbuffer", + srcs = ["saved_model_to_tfl_flatbuffer.cc"], + hdrs = [ + "saved_model_to_tfl_flatbuffer.h", + ], + deps = [ + ":tf_tfl_flatbuffer_helpers", + "//tensorflow/compiler/mlir/lite:common", + "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite:tf_tfl_passes", + "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/lite/toco:model_flags_proto_cc", + "//tensorflow/lite/toco:toco_flags_proto_cc", + "//tensorflow/lite/toco:types_proto_cc", + "//tensorflow/stream_executor/lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) diff --git 
a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index a3b71fbe8d8..660f73e59e9 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -25,6 +25,7 @@ limitations under the License. #include "mlir/Support/FileUtilities.h" // TF:llvm-project #include "mlir/Transforms/ViewOpGraph.h" // TF:llvm-project #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -40,288 +41,7 @@ limitations under the License. #include "tensorflow/lite/toco/types.pb.h" #include "tensorflow/stream_executor/lib/statusor.h" -using stream_executor::port::StatusOr; - namespace tensorflow { - -namespace { -// Op def string for TFLite_Detection_PostProcess Op. -const char kDetectionPostProcessOp[] = - "name: 'TFLite_Detection_PostProcess' input_arg: { name: " - "'raw_outputs/box_encodings' type: DT_FLOAT } input_arg: { name: " - "'raw_outputs/class_predictions' type: DT_FLOAT } input_arg: { name: " - "'anchors' type: DT_FLOAT } output_arg: { name: " - "'TFLite_Detection_PostProcess' type: DT_FLOAT } output_arg: { name: " - "'TFLite_Detection_PostProcess:1' type: DT_FLOAT } output_arg: { name: " - "'TFLite_Detection_PostProcess:2' type: DT_FLOAT } output_arg: { name: " - "'TFLite_Detection_PostProcess:3' type: DT_FLOAT } attr : { name: " - "'h_scale' type: 'float'} attr : { name: 'max_classes_per_detection' " - "type: 'int'} attr : { name: 'max_detections' type: 'int'} attr : { " - "name: 'nms_iou_threshold' type: 'float'} attr : { name: " - "'nms_score_threshold' type: 'float'} attr : { name: 'num_classes' type: " - "'int'} attr : { name: 'w_scale' type: 'float'} attr : { name: 'x_scale' " - "type: 'float'} attr : { name: 'y_scale' type: 'float'} attr { name: " - "'detections_per_class' type: 'int' default_value { i : 100 }} attr { " - "name: 'use_regular_nms' type: 'bool' default_value { b : false }}"; - -const char kUnidirectionalSequenceLstmOp[] = - "name: 'UnidirectionalSequenceLstm' input_arg: {name: 'Input' type: " - "DT_FLOAT} input_arg: { name: 'InputToInputWeights' type: DT_FLOAT } " - "input_arg: { name: 'InputToForgetWeights' type: DT_FLOAT } input_arg: { " - "name: 'InputToCellWeights' type: DT_FLOAT} input_arg: { name: " - "'InputToOutputWeights' type: DT_FLOAT } input_arg: { name: " - "'RecurrentToInputWeights' type: DT_FLOAT} input_arg: { name: " - "'RecurrentToForgetWeights' type: DT_FLOAT} input_arg: { name: " - "'RecurrentToCellWeights' type: DT_FLOAT } input_arg: { name: " - "'RecurrentToOutputWeights' type: DT_FLOAT } input_arg: { name: " - "'CellToInputWeights' type: DT_FLOAT} input_arg: { name: " - "'CellToForgetWeights' type: DT_FLOAT } input_arg: { name: " - "'CellToOutputWeights' type: DT_FLOAT } input_arg: { name: 'InputGateBias' " - "type: DT_FLOAT } input_arg: { name: 'ForgetGateBias' type: DT_FLOAT } " - "input_arg: { name: 'kCellGateBias' type: DT_FLOAT } input_arg: { name: " - "'OutputGateBias' type: DT_FLOAT } input_arg: { name: 'ProjectionWeights' " - "type: DT_FLOAT } input_arg: { name: 'ProjectionBias' type: DT_FLOAT } " - "input_arg: { name: 'InputActivationState' type: DT_FLOAT} input_arg: { " - "name: 'InputCellStateTensor' type: 
DT_FLOAT } " - "output_arg: { name: 'Concat' type: DT_FLOAT} " - "output_arg: { name: " - "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: DT_FLOAT} " - "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; - -const char kUnidirectionalSequenceRnnOp[] = - "name: 'UnidirectionalSequenceRnn' input_arg: {name: 'Input' type: " - "DT_FLOAT} input_arg: { name: 'Weights' type: DT_FLOAT } " - "input_arg: { name: 'RecurrentWeights' type: DT_FLOAT } input_arg: { " - "name: 'Bias' type: DT_FLOAT} " - "input_arg: { name: 'HiddenState' type: DT_FLOAT} " - "output_arg: { name: " - "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: " - "DT_FLOAT} " - "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; - -// Converts the toco::IODataType to tensorflow::DataType. Only contains the -// conversion mapping for constants defined in TFLite Python API. -DataType ConvertIODataTypeToDataType(toco::IODataType dtype) { - switch (dtype) { - case toco::IODataType::FLOAT: - return DT_FLOAT; - case toco::IODataType::QUANTIZED_UINT8: - return DT_QUINT8; - case toco::IODataType::INT8: - return DT_QINT8; - case toco::IODataType::INT32: - return DT_INT32; - case toco::IODataType::INT64: - return DT_INT64; - case toco::IODataType::STRING: - return DT_STRING; - case toco::IODataType::BOOL: - return DT_BOOL; - default: - return DT_INVALID; - } -} - -StatusOr> InputStatsToMinMax(double mean, double std, - DataType type) { - // Only qint8 and quint8 are considered here. - double qmin, qmax; - if (type == DT_QUINT8) { - qmin = 0.0; - qmax = 255.0; - } else if (type == DT_QINT8) { - qmin = -128.0; - qmax = 127.0; - } else { - return errors::InvalidArgument("Only int8 and uint8 are considered."); - } - return std::make_pair((qmin - mean) / std, (qmax - mean) / std); -} - -// Give a warning for any unused flags that have been specified. -void WarningUnusedFlags(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags) { - if (toco_flags.output_format()) { - LOG(WARNING) << "Ignored output_format."; - } - if (toco_flags.drop_control_dependency()) { - LOG(WARNING) << "Ignored drop_control_dependency."; - } - if (toco_flags.reorder_across_fake_quant()) { - LOG(WARNING) << "Ignored reorder_across_fake_quant."; - } - if (model_flags.change_concat_input_ranges()) { - LOG(WARNING) << "Ignored change_concat_input_ranges."; - } - if (toco_flags.dump_graphviz_include_video()) { - LOG(WARNING) << "Ignored dump_graphviz_video."; - } - if (model_flags.allow_nonexistent_arrays()) { - LOG(WARNING) << "Allow allow_nonexistent_arrays."; - } -} - -// Dumps the op graph of the `module` to `filename` in DOT format. 
-Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { - std::string error_message; - auto output = mlir::openOutputFile(filename, &error_message); - if (!error_message.empty()) { - return errors::InvalidArgument("Failed to open file in %s.", filename); - } - mlir::PassManager pm(module.getContext()); - pm.addPass(mlir::createPrintOpGraphPass(output->os())); - if (failed(pm.run(module))) { - return errors::Unknown("Failed to dump Op Graph from MLIR module."); - } - output->keep(); - return Status::OK(); -} - -Status RegisterCustomBuiltinOps(const std::vector extra_tf_opdefs) { - for (const auto& tf_opdefs_string : extra_tf_opdefs) { - tensorflow::OpDef opdef; - if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, - &opdef)) { - return errors::InvalidArgument("fail to parse extra OpDef"); - } - // Make sure the op is not already registered. If registered continue. - const OpRegistrationData* op_reg = - tensorflow::OpRegistry::Global()->LookUp(opdef.name()); - if (op_reg) continue; - - tensorflow::OpRegistry::Global()->Register( - [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { - *op_reg_data = tensorflow::OpRegistrationData(opdef); - return Status::OK(); - }); - } - return Status::OK(); -} - -Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { - // Register any custom OpDefs. - std::vector extra_tf_opdefs(toco_flags.custom_opdefs().begin(), - toco_flags.custom_opdefs().end()); - extra_tf_opdefs.push_back(kDetectionPostProcessOp); - extra_tf_opdefs.push_back(kUnidirectionalSequenceLstmOp); - extra_tf_opdefs.push_back(kUnidirectionalSequenceRnnOp); - return RegisterCustomBuiltinOps(extra_tf_opdefs); -} - -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs) { - quant_specs->inference_input_type = - ConvertIODataTypeToDataType(toco_flags.inference_input_type()); - tensorflow::DataType inference_type = - ConvertIODataTypeToDataType(toco_flags.inference_type()); - // Use non-float flag `inference_input_type` to override the `inference_type` - // because we have to apply quantization to satisfy that. - if (quant_specs->inference_input_type != tensorflow::DT_FLOAT) { - inference_type = quant_specs->inference_input_type; - } - - for (auto& flag : model_flags.input_arrays()) { - node_names->push_back(flag.name()); - // TOCO doesn't required `data_type` to be filled for every input. - // If it's not filled, make it an empty string so the importer will use - // the data type in the NodeDef. 
- auto toco_data_type = flag.data_type(); - if (toco_data_type == ::toco::IODataType::IO_DATA_TYPE_UNKNOWN) { - node_dtypes->push_back(""); - } else { - node_dtypes->push_back( - DataType_Name(ConvertIODataTypeToDataType(toco_data_type))); - } - node_shapes->push_back(std::vector(flag.shape().dims().begin(), - flag.shape().dims().end())); - // Currently, only UINT8 and INT8 require inputs stats - if (inference_type == DT_QINT8 || inference_type == DT_QUINT8) { - TF_ASSIGN_OR_RETURN( - auto min_max, InputStatsToMinMax(flag.mean_value(), flag.std_value(), - inference_type)); - node_mins->push_back(min_max.first); - node_maxs->push_back(min_max.second); - } - } - - if (mlir::TFL::GetInputNodeQuantSpecs(*node_names, *node_mins, *node_maxs, - inference_type, quant_specs)) { - return errors::InvalidArgument("Failed to get input quant spec."); - } - - // Some extra flag related to post training quantization. If post-training - // quantization is enabled, `inference_type` and `inference_input_type` are - // not used by MLIR passes. - if (toco_flags.post_training_quantize()) { - quant_specs->weight_quantization = true; - if (toco_flags.quantize_to_float16()) { - quant_specs->inference_type = tensorflow::DT_HALF; - quant_specs->inference_input_type = tensorflow::DT_HALF; - } else { - quant_specs->inference_type = tensorflow::DT_QINT8; - quant_specs->inference_input_type = tensorflow::DT_QINT8; - } - } - - // Other flags. - if (toco_flags.has_default_ranges_min()) { - quant_specs->default_ranges.first = toco_flags.default_ranges_min(); - } - if (toco_flags.has_default_ranges_max()) { - quant_specs->default_ranges.second = toco_flags.default_ranges_max(); - } - - return ::tensorflow::Status::OK(); -} - -Status ConvertMLIRToTFLiteFlatBuffer(const toco::TocoFlags& toco_flags, - mlir::OwningModuleRef module, - mlir::TFL::QuantizationSpecs quant_specs, - string* result) { - bool emit_builtin_tflite_ops = !toco_flags.force_select_tf_ops(); - bool emit_select_tf_ops = toco_flags.enable_select_tf_ops(); - bool emit_custom_ops = toco_flags.allow_custom_ops(); - - if (toco_flags.has_dump_graphviz_dir()) { - TF_RETURN_IF_ERROR(DumpOpGraphToFile( - module.get(), - // rename once we enable the new converter feature flag. - absl::StrCat(toco_flags.dump_graphviz_dir(), "/toco_AT_IMPORT.dot"))); - } - - mlir::PassManager pm(module->getContext()); - mlir::TFL::PassConfig pass_config(quant_specs); - pass_config.emit_builtin_tflite_ops = emit_builtin_tflite_ops; - pass_config.lower_tensor_list_ops = true; - - tensorflow::AddTFToTFLConversionPasses(pass_config, &pm); - // Convert back to outlined while format for export back to flatbuffer. - if (pass_config.legalize_tf_while) { - pm.addPass(mlir::TFL::CreateWhileOutlinePass()); - } - pm.addPass(mlir::TFL::CreateRuntimeTypeVerifyPass()); - - auto status = ConvertTFExecutorToTFLOrFlatbuffer( - module.get(), /*export_to_mlir=*/false, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops, quant_specs, result, &pm); - if (toco_flags.has_dump_graphviz_dir()) { - TF_RETURN_IF_ERROR(DumpOpGraphToFile( - // rename once we enable the new converter feature flag. 
- module.get(), absl::StrCat(toco_flags.dump_graphviz_dir(), - "/toco_AFTER_TRANSFORMATIONS.dot"))); - } - - return status; -} - -} // namespace - Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, const GraphDebugInfo& debug_info, @@ -339,7 +59,7 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, std::vector node_maxs; // Populate quantization specs. - TF_RETURN_IF_ERROR(PopulateQuantizationSpecs( + TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( model_flags, toco_flags, &quant_specs, &node_names, &node_dtypes, &node_shapes, &node_mins, &node_maxs)); @@ -356,16 +76,16 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, specs.convert_legacy_fed_inputs = true; specs.graph_as_function = false; specs.upgrade_legacy = true; - WarningUnusedFlags(model_flags, toco_flags); + internal::WarningUnusedFlags(model_flags, toco_flags); // Register all custom ops, including user-specified custom ops. - TF_RETURN_IF_ERROR(RegisterAllCustomOps(toco_flags)); + TF_RETURN_IF_ERROR(internal::RegisterAllCustomOps(toco_flags)); TF_ASSIGN_OR_RETURN( auto module, ConvertGraphdefToMlir(input, debug_info, specs, &context)); - return ConvertMLIRToTFLiteFlatBuffer(toco_flags, std::move(module), - quant_specs, result); + return internal::ConvertMLIRToTFLiteFlatBuffer(toco_flags, std::move(module), + quant_specs, result); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc new file mode 100644 index 00000000000..a546dba3ff3 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -0,0 +1,78 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h" + +#include + +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/MLIRContext.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/FileUtilities.h" // TF:llvm-project +#include "mlir/Transforms/ViewOpGraph.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" +#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" +#include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/graph_debug_info.pb.h" +#include "tensorflow/lite/toco/model_flags.pb.h" +#include "tensorflow/lite/toco/toco_flags.pb.h" +#include "tensorflow/lite/toco/types.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +Status ConvertSavedModelToTFLiteFlatBuffer( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + const string& saved_model_dir, bool saved_model_v1, + const string& saved_model_tags, const string& saved_model_exported_names, + string* result) { + mlir::MLIRContext context; + mlir::TFL::QuantizationSpecs quant_specs; + + // Parse input arrays. + std::vector node_names; + std::vector node_dtypes; + std::vector> node_shapes; + std::vector node_mins; + std::vector node_maxs; + + // Populate quantization specs. + TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( + model_flags, toco_flags, &quant_specs, &node_names, &node_dtypes, + &node_shapes, &node_mins, &node_maxs)); + + internal::WarningUnusedFlags(model_flags, toco_flags); + + // Register all custom ops, including user-specified custom ops. + TF_RETURN_IF_ERROR(internal::RegisterAllCustomOps(toco_flags)); + + const bool import_saved_model = !saved_model_v1; + TF_ASSIGN_OR_RETURN( + auto module, + ImportSavedModel(import_saved_model, saved_model_v1, saved_model_dir, + saved_model_tags, saved_model_exported_names, &context)); + return internal::ConvertMLIRToTFLiteFlatBuffer(toco_flags, std::move(module), + quant_specs, result); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h new file mode 100644 index 00000000000..dea5603dad0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ + +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/protobuf/graph_debug_info.pb.h" +#include "tensorflow/lite/toco/model_flags.pb.h" +#include "tensorflow/lite/toco/toco_flags.pb.h" + +namespace tensorflow { + +// Converts the given saved_model(either v1 or v2) to a TF Lite FlatBuffer +// string according to the given model flags, toco flags and tags. Returns error +// status if it fails to convert the input. +Status ConvertSavedModelToTFLiteFlatBuffer( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + const string& saved_model_dir, bool saved_model_v1, + const string& saved_model_tags, const string& saved_model_exported_names, + string* result); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc new file mode 100644 index 00000000000..e0eb8004a01 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -0,0 +1,325 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" + +#include +#include + +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/MLIRContext.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/FileUtilities.h" // TF:llvm-project +#include "mlir/Transforms/ViewOpGraph.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" +#include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/graph_debug_info.pb.h" +#include "tensorflow/lite/toco/model_flags.pb.h" +#include "tensorflow/lite/toco/toco_flags.pb.h" +#include "tensorflow/lite/toco/types.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using stream_executor::port::StatusOr; + +namespace tensorflow { +namespace internal { +namespace { + +// Op def string for TFLite_Detection_PostProcess Op. +const char kDetectionPostProcessOp[] = + "name: 'TFLite_Detection_PostProcess' input_arg: { name: " + "'raw_outputs/box_encodings' type: DT_FLOAT } input_arg: { name: " + "'raw_outputs/class_predictions' type: DT_FLOAT } input_arg: { name: " + "'anchors' type: DT_FLOAT } output_arg: { name: " + "'TFLite_Detection_PostProcess' type: DT_FLOAT } output_arg: { name: " + "'TFLite_Detection_PostProcess:1' type: DT_FLOAT } output_arg: { name: " + "'TFLite_Detection_PostProcess:2' type: DT_FLOAT } output_arg: { name: " + "'TFLite_Detection_PostProcess:3' type: DT_FLOAT } attr : { name: " + "'h_scale' type: 'float'} attr : { name: 'max_classes_per_detection' " + "type: 'int'} attr : { name: 'max_detections' type: 'int'} attr : { " + "name: 'nms_iou_threshold' type: 'float'} attr : { name: " + "'nms_score_threshold' type: 'float'} attr : { name: 'num_classes' type: " + "'int'} attr : { name: 'w_scale' type: 'float'} attr : { name: 'x_scale' " + "type: 'float'} attr : { name: 'y_scale' type: 'float'} attr { name: " + "'detections_per_class' type: 'int' default_value { i : 100 }} attr { " + "name: 'use_regular_nms' type: 'bool' default_value { b : false }}"; + +const char kUnidirectionalSequenceLstmOp[] = + "name: 'UnidirectionalSequenceLstm' input_arg: {name: 'Input' type: " + "DT_FLOAT} input_arg: { name: 'InputToInputWeights' type: DT_FLOAT } " + "input_arg: { name: 'InputToForgetWeights' type: DT_FLOAT } input_arg: { " + "name: 'InputToCellWeights' type: DT_FLOAT} input_arg: { name: " + "'InputToOutputWeights' type: DT_FLOAT } input_arg: { name: " + "'RecurrentToInputWeights' type: DT_FLOAT} input_arg: { name: " + "'RecurrentToForgetWeights' type: DT_FLOAT} input_arg: { name: " + "'RecurrentToCellWeights' type: DT_FLOAT } input_arg: { name: " + "'RecurrentToOutputWeights' type: DT_FLOAT } input_arg: { name: " + "'CellToInputWeights' type: DT_FLOAT} input_arg: { name: " + "'CellToForgetWeights' type: DT_FLOAT } input_arg: { name: " + "'CellToOutputWeights' type: DT_FLOAT } input_arg: { name: 'InputGateBias' " + "type: 
DT_FLOAT } input_arg: { name: 'ForgetGateBias' type: DT_FLOAT } " + "input_arg: { name: 'kCellGateBias' type: DT_FLOAT } input_arg: { name: " + "'OutputGateBias' type: DT_FLOAT } input_arg: { name: 'ProjectionWeights' " + "type: DT_FLOAT } input_arg: { name: 'ProjectionBias' type: DT_FLOAT } " + "input_arg: { name: 'InputActivationState' type: DT_FLOAT} input_arg: { " + "name: 'InputCellStateTensor' type: DT_FLOAT } " + "output_arg: { name: 'Concat' type: DT_FLOAT} " + "output_arg: { name: " + "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: DT_FLOAT} " + "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; + +const char kUnidirectionalSequenceRnnOp[] = + "name: 'UnidirectionalSequenceRnn' input_arg: {name: 'Input' type: " + "DT_FLOAT} input_arg: { name: 'Weights' type: DT_FLOAT } " + "input_arg: { name: 'RecurrentWeights' type: DT_FLOAT } input_arg: { " + "name: 'Bias' type: DT_FLOAT} " + "input_arg: { name: 'HiddenState' type: DT_FLOAT} " + "output_arg: { name: " + "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: " + "DT_FLOAT} " + "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; + +// Converts the toco::IODataType to tensorflow::DataType. Only contains the +// conversion mapping for constants defined in TFLite Python API. +DataType ConvertIODataTypeToDataType(toco::IODataType dtype) { + switch (dtype) { + case toco::IODataType::FLOAT: + return DT_FLOAT; + case toco::IODataType::QUANTIZED_UINT8: + return DT_QUINT8; + case toco::IODataType::INT8: + return DT_QINT8; + case toco::IODataType::INT32: + return DT_INT32; + case toco::IODataType::INT64: + return DT_INT64; + case toco::IODataType::STRING: + return DT_STRING; + case toco::IODataType::BOOL: + return DT_BOOL; + default: + return DT_INVALID; + } +} + +StatusOr> InputStatsToMinMax(double mean, double std, + DataType type) { + // Only qint8 and quint8 are considered here. + double qmin, qmax; + if (type == DT_QUINT8) { + qmin = 0.0; + qmax = 255.0; + } else if (type == DT_QINT8) { + qmin = -128.0; + qmax = 127.0; + } else { + return errors::InvalidArgument("Only int8 and uint8 are considered."); + } + return std::make_pair((qmin - mean) / std, (qmax - mean) / std); +} + +Status RegisterCustomBuiltinOps(const std::vector extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + tensorflow::OpDef opdef; + if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, + &opdef)) { + return errors::InvalidArgument("fail to parse extra OpDef"); + } + // Make sure the op is not already registered. If registered continue. + const OpRegistrationData* op_reg = + tensorflow::OpRegistry::Global()->LookUp(opdef.name()); + if (op_reg) continue; + + tensorflow::OpRegistry::Global()->Register( + [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { + *op_reg_data = tensorflow::OpRegistrationData(opdef); + return Status::OK(); + }); + } + return Status::OK(); +} + +} // namespace + +Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { + // Register any custom OpDefs. 
+ std::vector extra_tf_opdefs(toco_flags.custom_opdefs().begin(), + toco_flags.custom_opdefs().end()); + extra_tf_opdefs.push_back(kDetectionPostProcessOp); + extra_tf_opdefs.push_back(kUnidirectionalSequenceLstmOp); + extra_tf_opdefs.push_back(kUnidirectionalSequenceRnnOp); + return RegisterCustomBuiltinOps(extra_tf_opdefs); +} + +Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, + const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, + std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector* node_mins, + std::vector* node_maxs) { + quant_specs->inference_input_type = + ConvertIODataTypeToDataType(toco_flags.inference_input_type()); + tensorflow::DataType inference_type = + ConvertIODataTypeToDataType(toco_flags.inference_type()); + // Use non-float flag `inference_input_type` to override the `inference_type` + // because we have to apply quantization to satisfy that. + if (quant_specs->inference_input_type != tensorflow::DT_FLOAT) { + inference_type = quant_specs->inference_input_type; + } + + for (auto& flag : model_flags.input_arrays()) { + node_names->push_back(flag.name()); + // TOCO doesn't required `data_type` to be filled for every input. + // If it's not filled, make it an empty string so the importer will use + // the data type in the NodeDef. + auto toco_data_type = flag.data_type(); + if (toco_data_type == ::toco::IODataType::IO_DATA_TYPE_UNKNOWN) { + node_dtypes->push_back(""); + } else { + node_dtypes->push_back( + DataType_Name(ConvertIODataTypeToDataType(toco_data_type))); + } + node_shapes->push_back(std::vector(flag.shape().dims().begin(), + flag.shape().dims().end())); + // Currently, only UINT8 and INT8 require inputs stats + if (inference_type == DT_QINT8 || inference_type == DT_QUINT8) { + TF_ASSIGN_OR_RETURN( + auto min_max, InputStatsToMinMax(flag.mean_value(), flag.std_value(), + inference_type)); + node_mins->push_back(min_max.first); + node_maxs->push_back(min_max.second); + } + } + + if (mlir::TFL::GetInputNodeQuantSpecs(*node_names, *node_mins, *node_maxs, + inference_type, quant_specs)) { + return errors::InvalidArgument("Failed to get input quant spec."); + } + + // Some extra flag related to post training quantization. If post-training + // quantization is enabled, `inference_type` and `inference_input_type` are + // not used by MLIR passes. + if (toco_flags.post_training_quantize()) { + quant_specs->weight_quantization = true; + if (toco_flags.quantize_to_float16()) { + quant_specs->inference_type = tensorflow::DT_HALF; + quant_specs->inference_input_type = tensorflow::DT_HALF; + } else { + quant_specs->inference_type = tensorflow::DT_QINT8; + quant_specs->inference_input_type = tensorflow::DT_QINT8; + } + } + + // Other flags. + if (toco_flags.has_default_ranges_min()) { + quant_specs->default_ranges.first = toco_flags.default_ranges_min(); + } + if (toco_flags.has_default_ranges_max()) { + quant_specs->default_ranges.second = toco_flags.default_ranges_max(); + } + + return ::tensorflow::Status::OK(); +} + +// Dumps the op graph of the `module` to `filename` in DOT format. 
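+// The emitted .dot file can be rendered with Graphviz, e.g.
+//   dot -Tpdf toco_AT_IMPORT.dot -o toco_AT_IMPORT.pdf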
+Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { + std::string error_message; + auto output = mlir::openOutputFile(filename, &error_message); + if (!error_message.empty()) { + return errors::InvalidArgument("Failed to open file in %s.", filename); + } + mlir::PassManager pm(module.getContext()); + pm.addPass(mlir::createPrintOpGraphPass(output->os())); + if (failed(pm.run(module))) { + return errors::Unknown("Failed to dump Op Graph from MLIR module."); + } + output->keep(); + return Status::OK(); +} + +Status ConvertMLIRToTFLiteFlatBuffer(const toco::TocoFlags& toco_flags, + mlir::OwningModuleRef module, + mlir::TFL::QuantizationSpecs quant_specs, + string* result) { + bool emit_builtin_tflite_ops = !toco_flags.force_select_tf_ops(); + bool emit_select_tf_ops = toco_flags.enable_select_tf_ops(); + bool emit_custom_ops = toco_flags.allow_custom_ops(); + + if (toco_flags.has_dump_graphviz_dir()) { + TF_RETURN_IF_ERROR(DumpOpGraphToFile( + module.get(), + // rename once we enable the new converter feature flag. + absl::StrCat(toco_flags.dump_graphviz_dir(), "/toco_AT_IMPORT.dot"))); + } + + mlir::PassManager pm(module->getContext()); + mlir::TFL::PassConfig pass_config(quant_specs); + pass_config.emit_builtin_tflite_ops = emit_builtin_tflite_ops; + pass_config.lower_tensor_list_ops = true; + + tensorflow::AddTFToTFLConversionPasses(pass_config, &pm); + // Convert back to outlined while format for export back to flatbuffer. + if (pass_config.legalize_tf_while) { + pm.addPass(mlir::TFL::CreateWhileOutlinePass()); + } + pm.addPass(mlir::TFL::CreateRuntimeTypeVerifyPass()); + + auto status = ConvertTFExecutorToTFLOrFlatbuffer( + module.get(), /*export_to_mlir=*/false, emit_builtin_tflite_ops, + emit_select_tf_ops, emit_custom_ops, quant_specs, result, &pm); + if (toco_flags.has_dump_graphviz_dir()) { + TF_RETURN_IF_ERROR(DumpOpGraphToFile( + // rename once we enable the new converter feature flag. + module.get(), absl::StrCat(toco_flags.dump_graphviz_dir(), + "/toco_AFTER_TRANSFORMATIONS.dot"))); + } + + return status; +} + +void WarningUnusedFlags(const toco::ModelFlags& model_flags, + const toco::TocoFlags& toco_flags) { + if (toco_flags.output_format()) { + LOG(WARNING) << "Ignored output_format."; + } + if (toco_flags.drop_control_dependency()) { + LOG(WARNING) << "Ignored drop_control_dependency."; + } + if (toco_flags.reorder_across_fake_quant()) { + LOG(WARNING) << "Ignored reorder_across_fake_quant."; + } + if (model_flags.change_concat_input_ranges()) { + LOG(WARNING) << "Ignored change_concat_input_ranges."; + } + if (toco_flags.dump_graphviz_include_video()) { + LOG(WARNING) << "Ignored dump_graphviz_video."; + } + if (model_flags.allow_nonexistent_arrays()) { + LOG(WARNING) << "Allow allow_nonexistent_arrays."; + } +} + +} // namespace internal +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h new file mode 100644 index 00000000000..41846d8e846 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ + +#include +#include + +#include "mlir/IR/Module.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/lite/toco/model_flags.pb.h" +#include "tensorflow/lite/toco/toco_flags.pb.h" +#include "tensorflow/lite/toco/types.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace internal { + +// Register all custom ops, including user-specified custom ops. +Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags); + +// Populate quantization specs (or not) given user-specified ranges for each +// input array. +Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, + const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, + std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector* node_mins, + std::vector* node_maxs); + +// Convert the imported MLIR module to a TFLite flatbuffer. +// This also runs the relevant passes. +Status ConvertMLIRToTFLiteFlatBuffer(const toco::TocoFlags& toco_flags, + mlir::OwningModuleRef module, + mlir::TFL::QuantizationSpecs quant_specs, + string* result); + +// Give a warning for any unused flags that have been specified. +void WarningUnusedFlags(const toco::ModelFlags& model_flags, + const toco::TocoFlags& toco_flags); +} // namespace internal +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization.td b/tensorflow/compiler/mlir/lite/quantization/quantization.td index 416c3d1719d..966740e605f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization.td +++ b/tensorflow/compiler/mlir/lite/quantization/quantization.td @@ -20,7 +20,7 @@ limitations under the License. #define TF_Quantization include "mlir/IR/OpBase.td" -include "mlir/Dialect/QuantOps/QuantPredicates.td" +include "mlir/Dialect/QuantOps/QuantOpsBase.td" //===----------------------------------------------------------------------===// // QuantizedType definitions. diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index a98d50bd07e..a321170349a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include #include #include #include @@ -147,29 +148,45 @@ static bool BroadcastVector(int target_size, SmallVectorImpl& data) { // Changes the axis of the input per-channel quantized type to match the // dimension of the target type. Returns nullptr if it fails.
static quant::UniformQuantizedPerAxisType ResetAxisAndBroadcast( - quant::UniformQuantizedPerAxisType qtype, Type target, int quant_dim) { + ArrayRef shape, quant::UniformQuantizedPerAxisType qtype, + Type target, int quant_dim) { auto shaped = target.dyn_cast(); if (!shaped) return {}; + ArrayRef new_shape = shaped.getShape(); SmallVector scales(qtype.getScales().begin(), qtype.getScales().end()); SmallVector zero_points(qtype.getZeroPoints().begin(), qtype.getZeroPoints().end()); - // Broadcast the scales and zero points to match the target size, which is - // usually the axis-th dimension of the target type. Currently, it covers two - // cases: - // - for Transpose, the data layout is changed so the `dim[axis]` still equals - // to the `scales_size`. The broadcast skips; - // - for Reshape, the data layout isn't changed but the innermost dimension is - // expand to cover the last two original dimensions. Thus we just need to be - // repeated the `scales` dim[2] times to covers the new dim length. - // - // TODO(b/141709944): after the fix, the `scales` can be for dim[2], thus we - // have to repeat each elements in the `scales` locally dim[3] times. - if (BroadcastVector(shaped.getDimSize(quant_dim), scales) || - BroadcastVector(shaped.getDimSize(quant_dim), zero_points)) { + + if (new_shape.size() == shape.size()) { // same rank + // Broadcast the scales and zero points to match the target size, which is + // usually the axis-th dimension of the target type. Currently, it covers + // two cases: + // - for Transpose, the data layout is changed so the `dim[axis]` still + // equals to the `scales_size`. The broadcast skips; + // - for Reshape, the data layout isn't changed but the innermost dimension + // is expand to cover the last two original dimensions. Thus we just need to + // be repeated the `scales` dim[2] times to covers the new dim length. + // + // TODO(b/141709944): after the fix, the `scales` can be for dim[2], thus we + // have to repeat each elements in the `scales` locally dim[3] times. + if (BroadcastVector(shaped.getDimSize(quant_dim), scales) || + BroadcastVector(shaped.getDimSize(quant_dim), zero_points)) { + return {}; + } + } else if ((new_shape.size() == shape.size() + 1) && new_shape.back() == 1) { + // This is a trivial shift left, then we shift the quant_dim as well. 
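+    // For example, a type quantized along quant_dim = -1 on a tensor<2x3>
+    // value that is reshaped to tensor<2x3x1> only gains a trailing unit
+    // dimension, so the quantized axis becomes index shape.size() - 1 == 1
+    // in the new shape and the scales/zero points are reused as-is.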
+ if (std::equal(shape.begin(), shape.end(), new_shape.begin()) && + quant_dim == -1) { + quant_dim = shape.size() + quant_dim; + } else { + return {}; + } + } else { return {}; } + return quant::UniformQuantizedPerAxisType::get( qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), scales, zero_points, quant_dim, qtype.getStorageTypeMin(), @@ -179,20 +196,21 @@ static quant::UniformQuantizedPerAxisType ResetAxisAndBroadcast( TypeAttr CastQuantizedTypeAttrFromExpressedType(Builder builder, TypeAttr source, Type target, int axis) { - if (auto source_type = source.getValue().dyn_cast_or_null()) { - auto src_ele_type = source_type.getElementType(); - if (auto quantized_type = src_ele_type.dyn_cast()) { - if (auto qtype = - quantized_type.dyn_cast()) { - quantized_type = ResetAxisAndBroadcast(qtype, target, axis); - if (!src_ele_type) return {}; - } - Type final_type = quantized_type.castFromExpressedType(target); - if (!final_type) return {}; - return TypeAttr::get(final_type); - } + auto source_type = source.getValue().dyn_cast_or_null(); + if (!source_type) return {}; + auto src_ele_type = source_type.getElementType(); + auto qtype = src_ele_type.dyn_cast(); + + // Reset the quantization dimensions if it is per-axis. + if (auto per_axis = + qtype.dyn_cast_or_null()) { + qtype = + ResetAxisAndBroadcast(source_type.getShape(), per_axis, target, axis); } - return {}; + if (!qtype) return {}; + Type final_type = qtype.castFromExpressedType(target); + if (!final_type) return {}; + return TypeAttr::get(final_type); } Type GetUniformQuantizedTypeForWeight(ElementsAttr attr, bool symmetric, diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/BUILD b/tensorflow/compiler/mlir/lite/quantization/xla/BUILD index 7616922b613..2c5bed86a84 100644 --- a/tensorflow/compiler/mlir/lite/quantization/xla/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/xla/BUILD @@ -9,6 +9,7 @@ package_group( name = "friends", includes = ["//third_party/mlir:subpackages"], packages = [ + "//tensorflow/compiler/aot/...", "//tensorflow/compiler/mlir/...", "//tensorflow/compiler/mlir/lite/...", ], @@ -38,3 +39,29 @@ cc_library( ], alwayslink = 1, ) + +cc_library( + name = "quantize", + srcs = [ + "quantize.cc", + ], + hdrs = [ + "quantize.h", + ], + deps = [ + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:hlo_to_mlir_hlo", + "//tensorflow/compiler/tf2xla", + "//tensorflow/compiler/tf2xla:mlir_tf2xla", + "//tensorflow/compiler/tf2xla:tf2xla_proto_cc", + "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/core/platform:status", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Transforms", + ], +) diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/quantize.cc b/tensorflow/compiler/mlir/lite/quantization/xla/quantize.cc new file mode 100644 index 00000000000..4640284fa5c --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/xla/quantize.cc @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/lite/quantization/xla/quantize.h" + +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/MLIRContext.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Pass/PassManager.h" // TF:llvm-project +#include "mlir/Transforms/Passes.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/xla/hlo_to_mlir_hlo.h" +#include "tensorflow/compiler/tf2xla/tf2xla.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" + +namespace mlir { +namespace xla_hlo { + +// Quantizes the model in the computation. +tensorflow::Status XlaQuantize(const tensorflow::tf2xla::Config& config, + xla::XlaComputation* computation) { + TF_ASSIGN_OR_RETURN(std::unique_ptr snapshot, + computation->Snapshot()); + + MLIRContext context; + OwningModuleRef module = ModuleOp::create(UnknownLoc::get(&context)); + auto status = xla::ConvertHloToMlirHlo( + module.get(), snapshot->mutable_hlo()->mutable_hlo_module()); + if (!status.ok()) { + LOG(ERROR) << "Hlo module import failed: " << status; + return status; + } + + PassManager pm(&context); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createInlinerPass()); + pm.addPass(createSymbolDCEPass()); + pm.addNestedPass(createCSEPass()); + + mlir::StatusScopedDiagnosticHandler diag_handler(&context); + LogicalResult result = pm.run(module.get()); + (void)result; + + module->dump(); + + return tensorflow::Status::OK(); +} + +} // namespace xla_hlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/quantize.h b/tensorflow/compiler/mlir/lite/quantization/xla/quantize.h new file mode 100644 index 00000000000..2ec5dbb02ce --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/xla/quantize.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_QUANTIZE_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_QUANTIZE_H_ + +#include "tensorflow/compiler/tf2xla/tf2xla.pb.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/core/platform/status.h" + +namespace mlir { +namespace xla_hlo { + +// Quantizes the model in the computation. 
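+// `config` carries the tf2xla feed/fetch specification and `computation`
+// holds the XLA computation to be quantized. As of this change (see
+// quantize.cc) the HLO is imported into MLIR HLO and cleanup passes
+// (canonicalize, inline, symbol-DCE, CSE) are run; the quantization rewrite
+// itself is still in progress.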
+tensorflow::Status XlaQuantize(const tensorflow::tf2xla::Config& config, + xla::XlaComputation* computation); + +} // namespace xla_hlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_XLA_QUANTIZE_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/tests/BUILD b/tensorflow/compiler/mlir/lite/quantization/xla/tests/BUILD index 4faa8d2efe8..4b6b4212567 100644 --- a/tensorflow/compiler/mlir/lite/quantization/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/xla/tests/BUILD @@ -3,8 +3,14 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) glob_lit_tests( - data = [":test_utilities"], + data = [ + ":graph_config_files", + ":test_utilities", + ], driver = "@llvm-project//mlir:run_lit.sh", + tags_override = { + "fadd_quant.mlir": ["no_oss"], # TODO(b/150957738): to be fixed on oss. + }, test_file_exts = ["mlir"], ) @@ -13,7 +19,17 @@ filegroup( name = "test_utilities", testonly = True, data = [ + "//tensorflow/compiler/aot:tfcompile", "//tensorflow/compiler/mlir:tf-opt", "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", ], ) + +# Bundle together all the graph files that are used by the tests. +filegroup( + name = "graph_config_files", + srcs = glob( + ["**/*.pbtxt"], + ), +) diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir b/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir new file mode 100644 index 00000000000..6b9ccfceddd --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir @@ -0,0 +1,15 @@ +# RUN: not tfcompile --graph=%s.pbtxt --config=%s.config.pbtxt --quantize --cpp_class="::test::fadd_quant" 2>&1 | FileCheck %s -dump-input-on-failure + +# TODO(fengliuai): update this file with the progress of the implementation +// CHECK: func @main +// CHECK: %cst = constant dense<0.000000e+00> : tensor +// CHECK: %cst_0 = constant dense<1.270000e+02> : tensor +// CHECK: %cst_1 = constant dense<8> : tensor +// CHECK: %cst_2 = constant dense : tensor +// CHECK: %0 = "xla_hlo.custom_call"(%arg0, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars", has_side_effect = false, name = "custom-call.9"} : (tensor<2x4xf32>, tensor, tensor, tensor, tensor) -> tensor<2x4xf32> +// CHECK: %1 = "xla_hlo.custom_call"(%arg1, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars", has_side_effect = false, name = "custom-call.14"} : (tensor<2x4xf32>, tensor, tensor, tensor, tensor) -> tensor<2x4xf32> +// CHECK: %2 = xla_hlo.add %0, %1 {name = "add.15"} : tensor<2x4xf32> +// CHECK: %3 = "xla_hlo.custom_call"(%2, %cst, %cst_0, %cst_1, %cst_2) {backend_config = "", call_target_name = "fake_quant_with_min_max_vars", has_side_effect = false, name = "custom-call.20"} : (tensor<2x4xf32>, tensor, tensor, tensor, tensor) -> tensor<2x4xf32> +// CHECK: %4 = "xla_hlo.tuple"(%3) {name = "tuple.22"} : (tensor<2x4xf32>) -> tuple> +// CHECK: return %4 : tuple> +// CHECK: } diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir.config.pbtxt b/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir.config.pbtxt new file mode 100644 index 00000000000..1e97c1fa326 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir.config.pbtxt @@ -0,0 +1,26 @@ +feed { + id { node_name: "input0" } + shape { + dim { size: 2 } + dim { size: 4 } + } +} +feed { + id { node_name: "input1" 
} + shape { + dim { size: 2 } + dim { size: 4 } + } +} + +fetch { + id { node_name: "Add/FakeQuantWithMinMaxVars" } + shape { + dim { size: 2 } + dim { size: 4 } + } +} + +conversion_options { + custom_fake_quant_op_calls: true +} diff --git a/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir.pbtxt b/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir.pbtxt new file mode 100644 index 00000000000..6995c861fd0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/xla/tests/fadd_quant.mlir.pbtxt @@ -0,0 +1,218 @@ +node: { + name: "Add/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "Add" + input: "Add/FakeQuantWithMinMaxVars/min" + input: "Add/FakeQuantWithMinMaxVars/max" + attr: { + key: "num_bits" + value: { + i: 8 + } + } + attr: { + key: "narrow_range" + value: { + b: false + } + } +} +node: { + name: "Add/FakeQuantWithMinMaxVars/min" + op: "Const" + attr: { + key: "value" + value: { + tensor: { + dtype: DT_FLOAT + tensor_shape: { + } + float_val: 0.0 + } + } + } + attr: { + key: "dtype" + value: { + type: DT_FLOAT + } + } +} +node: { + name: "Add/FakeQuantWithMinMaxVars/max" + op: "Const" + attr: { + key: "value" + value: { + tensor: { + dtype: DT_FLOAT + tensor_shape: { + } + float_val: 127.0 + } + } + } + attr: { + key: "dtype" + value: { + type: DT_FLOAT + } + } +} +node { + name: "Add" + op: "Add" + input: "input0/FakeQuantWithMinMaxVars" + input: "input1/FakeQuantWithMinMaxVars" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node: { + name: "input0/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "input0" + input: "input0/FakeQuantWithMinMaxVars/min" + input: "input0/FakeQuantWithMinMaxVars/max" + attr: { + key: "num_bits" + value: { + i: 8 + } + } + attr: { + key: "narrow_range" + value: { + b: false + } + } +} +node: { + name: "input0/FakeQuantWithMinMaxVars/min" + op: "Const" + attr: { + key: "value" + value: { + tensor: { + dtype: DT_FLOAT + tensor_shape: { + } + float_val: 0.0 + } + } + } + attr: { + key: "dtype" + value: { + type: DT_FLOAT + } + } +} +node: { + name: "input0/FakeQuantWithMinMaxVars/max" + op: "Const" + attr: { + key: "value" + value: { + tensor: { + dtype: DT_FLOAT + tensor_shape: { + } + float_val: 127.0 + } + } + } + attr: { + key: "dtype" + value: { + type: DT_FLOAT + } + } +} +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } +} +node: { + name: "input1/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "input1" + input: "input1/FakeQuantWithMinMaxVars/min" + input: "input1/FakeQuantWithMinMaxVars/max" + attr: { + key: "num_bits" + value: { + i: 8 + } + } + attr: { + key: "narrow_range" + value: { + b: false + } + } +} +node: { + name: "input1/FakeQuantWithMinMaxVars/min" + op: "Const" + attr: { + key: "value" + value: { + tensor: { + dtype: DT_FLOAT + tensor_shape: { + } + float_val: 0.0 + } + } + } + attr: { + key: "dtype" + value: { + type: DT_FLOAT + } + } +} +node: { + name: "input1/FakeQuantWithMinMaxVars/max" + op: "Const" + attr: { + key: "value" + value: { + tensor: { + dtype: DT_FLOAT + tensor_shape: { + } + float_val: 127.0 + } + } + } + attr: { + key: "dtype" + value: { + type: DT_FLOAT + } + } +} +node { + name: "input1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } +} +versions { + producer: 27 +} diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt index 44ef85bfac2..902d1c98cab 
100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf_tfl_translate -tf-input-arrays=input0,input1 -tf-input-shapes=4:4 -tf-input-data-types=DT_INT32,DT_INT32 -tf-output-arrays=Add %s -o - | flatbuffer_to_string - | FileCheck %s +# RUN: tf_tfl_translate -tf-input-arrays=input0,input1 -tf-input-shapes=4:4 -tf-input-data-types=DT_INT32,DT_INT32 -tf-output-arrays=Add %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s # Add two tensor<4xi32> inputs and return the result @@ -90,5 +90,11 @@ versions { # CHECK-EMPTY: # CHECK-NEXT: }, { # CHECK-EMPTY: +# CHECK-NEXT: }, { +# CHECK-NEXT: data: [ 49, 46, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +# CHECK-NEXT: } ], +# CHECK-NEXT: metadata: [ { +# CHECK-NEXT: name: "min_runtime_version", +# CHECK-NEXT: buffer: 4 # CHECK-NEXT: } ] # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir index 4225e360d58..a113c318d80 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/constants.mlir @@ -61,11 +61,11 @@ func @i64() -> tensor<4xi64> { // the same sort of opaque round-trip we get for complex64, but it might be good // to check -func @uint8() -> tensor<4x!tf.uint8> { +func @uint8() -> tensor<4xui8> { // CHECK-LABEL: @uint8 - // CHECK: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F55494E54382074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3333365C3235355C3237365C33353722"> : tensor<4x!tf.uint8> - %0 = "tfl.pseudo_const"() { value = opaque<"tf", "0x746674656E736F722464747970653A2044545F55494E54382074656E736F725F7368617065207B2064696D207B2073697A653A2034207D207D2074656E736F725F636F6E74656E743A20225C3333365C3235355C3237365C33353722"> : tensor<4x!tf.uint8> } : () -> tensor<4x!tf.uint8> - return %0 : tensor<4x!tf.uint8> + // CHECK: value = dense<[222, 173, 190, 239]> : tensor<4xui8> + %0 = "tfl.pseudo_const"() {value = dense<[222, 173, 190, 239]> : tensor<4xui8>} : () -> tensor<4xui8> + return %0 : tensor<4xui8> } func @qi32_per_axis() -> tensor<3x3x!quant.uniform> { diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir index 6003471f106..f58c0535f7c 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir @@ -13,3 +13,16 @@ func @main(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32 // CHECK: return %[[RES0]] } + +// ----- + +func @testFullyQuantizedLSTM(%arg0: tensor<1x528x!quant.uniform>, %arg1: tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, %arg2: tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, %arg3: tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, %arg4: tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, %arg5: tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, %arg6: tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, %arg7: tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, %arg8: tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, %arg9: tensor<2048x!quant.uniform>, %arg10: tensor<2048x!quant.uniform>, %arg11: tensor<2048x!quant.uniform>, %arg12: tensor<2048x!quant.uniform>, %arg13: 
tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, %arg14: tensor<640x!quant.uniform>, %arg15: tensor<2048x!quant.uniform>, %arg16: tensor<2048x!quant.uniform>, %arg17: tensor<2048x!quant.uniform>, %arg18: tensor<2048x!quant.uniform>, %arg19: tensor<1x640x!quant.uniform>, %arg20: tensor<1x2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> { + %cst = constant unit + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %cst, %cst, %cst, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) ({}) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, kernel_type = "FULL", proj_clip = 0.01 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> + return %0 : tensor<1x640x!quant.uniform> +// CHECK-LABEL: testFullyQuantizedLSTM +// CHECK: %cst = constant unit +// CHECK: %[[RES0:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %cst, %cst, %cst, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) +// CHECK: }) {cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, kernel_type = "FULL", proj_clip = 0.00999999977 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x528x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, tensor<2048x640x!quant.uniform>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> +} + diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir 
b/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir index 22d2c39535a..8f30aef8287 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-while.mlir @@ -58,15 +58,15 @@ func @while_cond_10_frozen0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: t // CANON-SAME: (tensor, tensor<256x256xf32>, tensor) // CANON: [[VAL_1:%.*]] = constant dense<1.000000e+00> : tensor<256x256xf32> // CANON: [[VAL_2:%.*]] = constant dense<0> : tensor -// CANON: [[VAL_3:%.*]] = constant dense<10> : tensor -// CANON: [[VAL_4:%.*]] = constant dense<1> : tensor -// CANON: [[VAL_5:%.*]] = "tf.Const"() {value = dense<2.560000e+02> : tensor<256x256xf32>} : () -> tensor // CANON: [[VAL_6:%.*]]:3 = "tfl.while"([[VAL_2]], [[VAL_2]], [[VAL_0]]) ( { // CANON: ^bb0([[VAL_7:%.*]]: tensor<*xi32>, [[VAL_8:%.*]]: tensor<*xi32>, [[VAL_9:%.*]]: tensor<*xf32>): +// CANON: [[VAL_3:%.*]] = constant dense<10> : tensor // CANON: [[VAL_10:%.*]] = "tf.Less"([[VAL_8]], [[VAL_3]]) // CANON: "tfl.yield"([[VAL_10]]) : (tensor<*xi1>) -> () // CANON: }, { // CANON: ^bb0([[VAL_11:%.*]]: tensor<*xi32>, [[VAL_12:%.*]]: tensor<*xi32>, [[VAL_13:%.*]]: tensor<*xf32>): +// CANON: [[VAL_4:%.*]] = constant dense<1> : tensor +// CANON: [[VAL_5:%.*]] = "tf.Const"() {value = dense<2.560000e+02> : tensor<256x256xf32>} : () -> tensor // CANON: [[VAL_14:%.*]] = "tf.AddV2"([[VAL_12]], [[VAL_4]]) // CANON: [[VAL_15:%.*]] = "tf.AddV2"([[VAL_13]], [[VAL_5]]) // CANON: [[VAL_16:%.*]] = "tf.AddV2"([[VAL_11]], [[VAL_4]]) diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 1256571c3b4..d236c8169b8 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1,22 +1,11 @@ // RUN: tf-opt %s -tfl-legalize-tf | FileCheck %s --dump-input-on-failure -func @addRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { +func @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %1 = "tf.Add"(%arg0, %0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %2 = "tf.Relu"(%1) : (tensor<1xf32>) -> tensor<1xf32> - %3 = "tf.Relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> - %4 = "tf.Add"(%3, %2) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %5 = "tf.Relu6"(%4) : (tensor<1xf32>) -> tensor<1xf32> - %6 = "tfl.add"(%5, %3) {fused_activation_function = "NONE"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %7 = "tf.Relu6"(%6) : (tensor<1xf32>) -> tensor<1xf32> - return %7: tensor<1xf32> + return %0: tensor<1xf32> -// CHECK-LABEL: addRelu +// CHECK-LABEL: add // CHECK: tfl.add %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<1xf32> -// CHECK: %1 = tfl.add %arg0, %0 {fused_activation_function = "RELU"} : tensor<1xf32> -// CHECK: %2 = "tfl.relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> -// CHECK: %3 = tfl.add %2, %1 {fused_activation_function = "RELU6"} : tensor<1xf32> -// CHECK: %4 = tfl.add %3, %2 {fused_activation_function = "RELU6"} : tensor<1xf32> // CHECK: return } @@ -30,13 +19,10 @@ func @LeakyRelu(%arg0: tensor<1xf32>) -> tensor<1xf32> { func @biasAdd(%arg0: tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> tensor<1x10x10x32xf32> { %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> - %1 = "tf.BiasAdd"(%0, %arg1) {T = "tfdtype$DT_FLOAT", 
data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> - %2 = "tf.Relu6"(%1) : (tensor<1x10x10x32xf32>) -> tensor<1x10x10x32xf32> - return %2 : tensor<1x10x10x32xf32> + return %0 : tensor<1x10x10x32xf32> // CHECK-LABEL: biasAdd // CHECK: "tfl.add"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> -// CHECK: %1 = "tfl.add"(%0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> } func @biasAddInt(%arg0: tensor<1x10x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x10x10x32xi32> { @@ -55,9 +41,9 @@ func @squeezeAndReshape(%arg0: tensor<1x1x10xf32>, %arg1: tensor) -> i %4 = "some_op"(%1, %3) : (tensor<*xf32>, tensor<2x5xf32>) -> i32 return %4 : i32 // CHECK-LABEL: squeezeAndReshape -// CHECK: %cst = constant dense<[2, 5]> : tensor<2xi32> // CHECK: "tfl.squeeze"(%arg0) {squeeze_dims = [0]} : (tensor<1x1x10xf32>) -> tensor<1x10xf32> // CHECK: %1 = "tfl.squeeze"(%arg1) {squeeze_dims = []} : (tensor) -> tensor<*xf32> +// CHECK: %cst = constant dense<[2, 5]> : tensor<2xi32> // CHECK: %2 = "tfl.reshape"(%0, %cst) : (tensor<1x10xf32>, tensor<2xi32>) -> tensor<2x5xf32> // CHECK: %3 = "some_op"(%1, %2) : (tensor<*xf32>, tensor<2x5xf32>) -> i32 // CHECK: return @@ -88,7 +74,7 @@ func @dynamicReshapeI64Fold(%arg0: tensor<*xf32>) -> tensor<1x2xf32> { return %0 : tensor<1x2xf32> // CHECK-LABEL: dynamicReshapeI64Fold -// CHECK-NEXT: %[[cst:.*]] = constant dense<[1, 2]> : tensor<2xi32> +// CHECK: %[[cst:.*]] = constant dense<[1, 2]> : tensor<2xi32> // CHECK-NEXT: %[[reshape:.*]] = "tfl.reshape"(%arg0, %[[cst]]) : (tensor<*xf32>, tensor<2xi32>) -> tensor<1x2xf32> // CHECK-NEXT: return %[[reshape]] : tensor<1x2xf32> } @@ -128,10 +114,10 @@ func @softplus(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { return %0 : tensor<8x16xf32> // CHECK-LABEL: softplus -// CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor -// CHECK-NEXT: %[[exp:.*]] = "tfl.exp"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> -// CHECK-NEXT: %[[add:.*]] = "tfl.add"(%[[exp]], %[[cst]]) {fused_activation_function = "NONE"} : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> -// CHECK-NEXT: %[[log:.*]] = "tfl.log"(%[[add]]) : (tensor<8x16xf32>) -> tensor<8x16xf32> +// CHECK: %[[exp:.*]] = "tfl.exp"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> +// CHECK: %[[cst:.*]] = constant dense<1.000000e+00> : tensor +// CHECK: %[[add:.*]] = "tfl.add"(%[[exp]], %[[cst]]) {fused_activation_function = "NONE"} : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> +// CHECK: %[[log:.*]] = "tfl.log"(%[[add]]) : (tensor<8x16xf32>) -> tensor<8x16xf32> } func @fakeQuantArgsFalse(%arg0: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> { @@ -255,20 +241,12 @@ func @zeros_like(%arg0: tensor<8x16xf32>) -> tensor<8x16xf32> { // CHECK: "tfl.zeros_like"(%arg0) : (tensor<8x16xf32>) -> tensor<8x16xf32> } -func @divRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { +func @div(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %1 = "tf.Div"(%arg0, %0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %2 = "tf.Relu"(%1) : (tensor<1xf32>) -> tensor<1xf32> - %3 = "tf.Relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> - %4 = "tf.Div"(%3, %2) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> - %5 = "tf.Relu6"(%4) : (tensor<1xf32>) -> tensor<1xf32> - return %5: tensor<1xf32> + return %0: tensor<1xf32> -// 
CHECK-LABEL: divRelu +// CHECK-LABEL: div // CHECK: tfl.div %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<1xf32> -// CHECK: %1 = tfl.div %arg0, %0 {fused_activation_function = "RELU"} : tensor<1xf32> -// CHECK: %2 = "tfl.relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> -// CHECK: %3 = tfl.div %2, %1 {fused_activation_function = "RELU6"} : tensor<1xf32> // CHECK: return } @@ -698,8 +676,9 @@ func @matrix_diag_v2_no_match(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK-SAME: [[VAL_0:%.*]]: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: [[VAL_1:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_2:%.*]] = constant dense<-1> : tensor<1xi32> +// CHECK: [[VAL_5:%.*]] = constant dense<-1> : tensor<1xi32> // CHECK: [[VAL_3:%.*]] = constant dense<0> : tensor<2xi32> -// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV2"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> +// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV2"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_5]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> // CHECK: return [[VAL_4]] : tensor<8x16x16xf32> } @@ -731,8 +710,9 @@ func @matrix_diag_v3_no_match(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK-SAME: [[VAL_0:%.*]]: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: [[VAL_1:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_2:%.*]] = constant dense<-1> : tensor<1xi32> +// CHECK: [[VAL_5:%.*]] = constant dense<-1> : tensor<1xi32> // CHECK: [[VAL_3:%.*]] = constant dense<0> : tensor<2xi32> -// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV3"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> +// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV3"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_5]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> // CHECK: return [[VAL_4]] : tensor<8x16x16xf32> } @@ -1295,7 +1275,8 @@ func @conv2d_backprop_input(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf32>, % // CHECK: %[[CST:.*]] = constant dense<[2, 0, 1, 3]> : tensor<4xi32> // CHECK: %[[ARG0:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> - // CHECK: %[[ARG2:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> + // CHECK: %[[CST_1:.*]] = constant dense<[2, 0, 1, 3]> : tensor<4xi32> + // CHECK: %[[ARG2:.*]] = "tfl.transpose"(%arg1, %[[CST_1]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG2]], %arg2) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>) -> tensor<15x28x28x1xf32> // CHECK: %[[RESULT:.*]] = tfl.add %[[ARG1]], %[[ARG3]] {fused_activation_function = "NONE"} : tensor<15x28x28x1xf32> // CHECK: return %[[RESULT]] : tensor<15x28x28x1xf32> @@ -1340,8 +1321,8 @@ func @reciprocal_f16(%arg0: tensor<8xf16>) -> tensor<8xf16> { return %0: tensor<8xf16> // CHECK-LABEL: reciprocal_f16 -// CHECK: %cst = constant dense<1.000000e+00> 
: tensor<1xf16> -// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor<1xf16>, tensor<8xf16>) -> tensor<8xf16> +// CHECK: %cst = constant dense<1.000000e+00> : tensor +// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor, tensor<8xf16>) -> tensor<8xf16> // CHECK: return } @@ -1350,8 +1331,8 @@ func @reciprocal_f32(%arg0: tensor<8xf32>) -> tensor<8xf32> { return %0: tensor<8xf32> // CHECK-LABEL: reciprocal_f32 -// CHECK: %cst = constant dense<1.000000e+00> : tensor<1xf32> -// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor<1xf32>, tensor<8xf32>) -> tensor<8xf32> +// CHECK: %cst = constant dense<1.000000e+00> : tensor +// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor, tensor<8xf32>) -> tensor<8xf32> // CHECK: return } @@ -1360,8 +1341,8 @@ func @reciprocal_complex_f32(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex< return %0: tensor<8xcomplex> // CHECK-LABEL: reciprocal_complex_f32 -// CHECK: %cst = constant opaque<"tf", "0x746674656E736F722464747970653A2044545F434F4D504C455836342074656E736F725F7368617065207B2064696D207B2073697A653A2031207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C3230303F5C3030305C3030305C3030305C30303022"> : tensor<1xcomplex> -// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor<1xcomplex>, tensor<8xcomplex>) -> tensor<8xcomplex> +// CHECK: %cst = constant opaque<"tf", "0x746674656E736F722464747970653A2044545F434F4D504C455836342074656E736F725F7368617065207B2064696D207B2073697A653A2031207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C3230303F5C3030305C3030305C3030305C30303022"> : tensor> +// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor>, tensor<8xcomplex>) -> tensor<8xcomplex> // CHECK: return } @@ -1370,8 +1351,8 @@ func @reciprocal_i32(%arg0: tensor<8xi32>) -> tensor<8xi32> { return %0: tensor<8xi32> // CHECK-LABEL: reciprocal_i32 -// CHECK: %cst = constant dense<1> : tensor<1xi32> -// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<8xi32>) -> tensor<8xi32> +// CHECK: %cst = constant dense<1> : tensor +// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor, tensor<8xi32>) -> tensor<8xi32> // CHECK: return } @@ -1380,8 +1361,8 @@ func @reciprocal_i64(%arg0: tensor<8xi64>) -> tensor<8xi64> { return %0: tensor<8xi64> // CHECK-LABEL: reciprocal_i64 -// CHECK: %cst = constant dense<1> : tensor<1xi64> -// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor<1xi64>, tensor<8xi64>) -> tensor<8xi64> +// CHECK: %cst = constant dense<1> : tensor +// CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor, tensor<8xi64>) -> tensor<8xi64> // CHECK: return } @@ -1436,7 +1417,7 @@ func @LstmWithoutProjection(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x16xf32>) // CHECK: [[VAL_3:%.*]] = constant dense<0.000000e+00> : tensor<16xf32> // CHECK: [[VAL_4:%.*]] = constant dense<0.000000e+00> : tensor<1x16xf32> // CHECK: [[VAL_5:%.*]] = constant unit -// CHECK: [[VAL_6:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_5]], [[VAL_5]], [[VAL_4]], [[VAL_4]], [[VAL_5]], [[VAL_5]], [[VAL_5]], [[VAL_5]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} 
: (tensor<28x1x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, none, none, tensor<1x16xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x16xf32> +// CHECK: [[VAL_6:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_5]], [[VAL_5]], [[VAL_4]], [[VAL_4]], [[VAL_5]], [[VAL_5]], [[VAL_5]], [[VAL_5]]) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, none, none, tensor<1x16xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x16xf32> // CHECK: return [[VAL_6]] : tensor<28x1x16xf32> // CHECK: } @@ -1461,7 +1442,7 @@ func @LstmWithProjection(%arg: tensor<28x1x16xf32>) -> (tensor<28x1x8xf32>) { // CHECK: [[VAL_12:%.*]] = constant dense<0.000000e+00> : tensor<8x16xf32> // CHECK: [[VAL_13:%.*]] = constant dense<0.000000e+00> : tensor<1x8xf32> // CHECK: [[VAL_14:%.*]] = constant unit -// CHECK: [[VAL_15:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_12]], [[VAL_14]], [[VAL_13]], [[VAL_11]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_14]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, none, none, none, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, none, tensor<1x8xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x8xf32> +// CHECK: [[VAL_15:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_12]], [[VAL_14]], [[VAL_13]], [[VAL_11]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_14]]) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, none, none, none, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, none, tensor<1x8xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x8xf32> // CHECK: return [[VAL_15]] : tensor<28x1x8xf32> // CHECK: } @@ -1480,3 +1461,25 @@ func @UnidirectionalRnn(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x28xf32>) { // CHECK: [[VAL_4:%.*]] = "tfl.unidirectional_sequence_rnn"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_3]]) {fused_activation_function = "TANH", time_major = true} : 
(tensor<28x1x28xf32>, tensor<28x28xf32>, tensor<28x28xf32>, tensor<28xf32>, tensor<1x28xf32>) -> tensor<28x1x28xf32> // CHECK: return [[VAL_4]] : tensor<28x1x28xf32> // CHECK: } + +func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { + %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> + return %0: tensor<3x3xf32> + +// CHECK-LABEL: broadcast_to_f32 +// CHECK: [[CST:%.*]] = constant dense<1.000000e+00> : tensor +// CHECK: [[FILL:%.*]] = "tfl.fill"(%arg1, [[CST]]) : (tensor<2xi32>, tensor) -> tensor<3x3xf32> +// CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[FILL]]) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> +// CHECK: return [[MUL]] : tensor<3x3xf32> +} + +func @broadcast_to_i32(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3x3xi32> { + %0 = "tf.BroadcastTo"(%input, %shape) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> + return %0: tensor<3x3xi32> + +// CHECK-LABEL: broadcast_to_i32 +// CHECK: [[CST:%.*]] = constant dense<1> : tensor +// CHECK: [[FILL:%.*]] = "tfl.fill"(%arg1, [[CST]]) : (tensor<2xi32>, tensor) -> tensor<3x3xi32> +// CHECK: [[MUL:%.*]] = "tfl.mul"(%arg0, [[FILL]]) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> +// CHECK: return [[MUL]] : tensor<3x3xi32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2exec/tfl_while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2exec/tfl_while_op.mlir index 39a93e1d03b..3addd8a9248 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2exec/tfl_while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2exec/tfl_while_op.mlir @@ -12,11 +12,9 @@ // CHECK: Tensor 0 pconst kTfLiteInt32 kTfLiteMmapRo 4 bytes // CHECK-NEXT: Tensor 1 N kTfLiteInt32 kTfLiteMmapRo 4 bytes // CHECK-NEXT: Tensor 2 val kTfLiteFloat32 kTfLiteMmapRo 4 bytes -// CHECK-NEXT: Tensor 3 std.constant kTfLiteInt32 kTfLiteMmapRo 4 bytes -// CHECK-NEXT: Tensor 4 tfl.while kTfLiteInt32 kTfLiteArenaRw 4 bytes -// CHECK-NEXT: Tensor 5 result kTfLiteFloat32 kTfLiteArenaRw 4 bytes -// CHECK-NEXT: Tensor 6 tfl.while:2 kTfLiteInt32 kTfLiteArenaRw 4 bytes -// CHECK-NEXT: Tensor 7 tfl.while:3 kTfLiteInt32 kTfLiteArenaRw 4 bytes +// CHECK-NEXT: Tensor 3 tfl.while kTfLiteInt32 kTfLiteArenaRw 4 bytes +// CHECK-NEXT: Tensor 4 result kTfLiteFloat32 kTfLiteArenaRw 4 bytes +// CHECK-NEXT: Tensor 5 tfl.while:2 kTfLiteInt32 kTfLiteArenaRw 4 bytes // Verify while was not folded away: // ------------------------------------ diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir index 5ede7c05234..47e1ccee3c9 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir @@ -108,6 +108,12 @@ func @main(tensor<1x384xf32>, tensor<1x96xf32>, tensor<384x480xf32>, tensor<384x // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 48, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 10 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir index 8d4c93fccc0..9d134a3fcad 100644 ---
a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/convolution_2d_transpose_bias.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --check-prefix=MLIR %s @@ -61,6 +61,12 @@ func @main(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<1x32x42x128xf32>, %arg2: // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 5 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir index ec6b9e313f6..1b46fa3d0e5 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4xf32>) -> tensor<4xf32> { ^bb0(%arg0: tensor<4xf32>): @@ -90,6 +90,12 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 55, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir index 10a62121485..ffa379124e6 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir @@ -82,6 +82,12 @@ func @main(tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 49, 51, 46, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir index ce079ccccf7..627de564931 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir @@ -84,6 +84,12 @@ func @main(tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 49, 51, 46, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // 
CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir index 236fc605c9d..13f8b998fff 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4xf32>) -> tensor<4xf32> { ^bb0(%arg0: tensor<4xf32>): @@ -88,6 +88,12 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 55, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir index 2505f73ee31..48994bf4617 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate -tflite-flatbuffer-to-mlir - -o - | FileCheck --check-prefix=IMPORT %s func @main(tensor<4xf32>) -> tensor<4xf32> { @@ -46,6 +46,12 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir index c98fdeb514e..9c4524586a5 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-select-tf-ops=true -emit-builtin-tflite-ops=false -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-select-tf-ops=true -emit-builtin-tflite-ops=false -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> { // CHECK: { @@ -39,6 +39,12 @@ func @main(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] // 
CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir index 0bde1879b10..6f1bafcd7a9 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-select-tf-ops -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-select-tf-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4xf32>) -> tensor<4xf32> { ^bb0(%arg0: tensor<4xf32>): @@ -89,6 +89,12 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 55, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir index 85ad8f01dbe..2015d694e7f 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir @@ -61,6 +61,12 @@ func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 5 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir index 6f7fc9c967d..44c757d2fa8 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir @@ -61,6 +61,12 @@ func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 49, 48, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 5 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir index 8ad0e1b0278..e325262eaa4 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir @@ -156,6 +156,12 @@ // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 11 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir index fd3f37eec73..4cedc6a218e 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4xi1>) -> tensor<4xi1> { ^bb0(%arg0: tensor<4xi1>): @@ -78,6 +78,12 @@ func @main(tensor<4xi1>) -> tensor<4xi1> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 49, 49, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir index ed3c8a6f702..2ddb78dd4e5 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { // CHECK: { @@ -192,7 +192,8 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: builtin_options_type: LSTMOptions, // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: -// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: intermediates: [ ] // CHECK-NEXT: } ], // CHECK-NEXT: name: "main" // CHECK-NEXT: } ], @@ -249,6 +250,12 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 55, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 26 // CHECK-NEXT: } ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir new file mode 100644 index 00000000000..6ae8ec8f3c7 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir @@ -0,0 +1,323 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s + +func @main(%arg0: tensor<1x528x!quant.uniform>, %arg1: tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, %arg2: tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, %arg3: tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, %arg4: tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, %arg5: tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, %arg6: tensor<2048x640x!quant.uniform:f32, 
0.022830000147223473>>, %arg7: tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, %arg8: tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, %arg9: tensor<2048x!quant.uniform>, %arg10: tensor<2048x!quant.uniform>, %arg11: tensor<2048x!quant.uniform>, %arg12: tensor<2048x!quant.uniform>, %arg13: tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, %arg14: tensor<640x!quant.uniform>, %arg15: tensor<2048x!quant.uniform>, %arg16: tensor<2048x!quant.uniform>, %arg17: tensor<2048x!quant.uniform>, %arg18: tensor<2048x!quant.uniform>, %arg19: tensor<1x640x!quant.uniform>, %arg20: tensor<1x2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> { + %cst = constant unit + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %cst, %cst, %cst, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) ({}) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, kernel_type = "FULL", proj_clip = 0.01 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> + return %0 : tensor<1x640x!quant.uniform> +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: LSTM, +// CHECK-NEXT: version: 1 +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 1, 528 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "arg0", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.037248 ], +// CHECK-NEXT: zero_point: [ -19 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 528 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "arg1", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.059802 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 528 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "arg2", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.031926 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 528 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "arg3", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.056272 ], +// CHECK-NEXT: zero_point: [ 0 ] 
+// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 528 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: name: "arg4", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.063764 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 640 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "arg5", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.013359 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 640 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 7, +// CHECK-NEXT: name: "arg6", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.02283 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 640 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 8, +// CHECK-NEXT: name: "arg7", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.032276 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048, 640 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 9, +// CHECK-NEXT: name: "arg8", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.035427 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 10, +// CHECK-NEXT: name: "arg9", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.0 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 11, +// CHECK-NEXT: name: "arg10", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.0 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 12, +// CHECK-NEXT: name: "arg11", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.0 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 13, +// CHECK-NEXT: name: "arg12", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.0 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 640, 2048 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 14, +// CHECK-NEXT: name: "arg13", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.021174 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 640 ], +// CHECK-NEXT: type: INT32, +// CHECK-NEXT: buffer: 15, +// CHECK-NEXT: name: "arg14", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.00016 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: buffer: 16, +// CHECK-NEXT: name: "arg15", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.000437 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: buffer: 17, +// CHECK-NEXT: name: "arg16", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.00011 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: buffer: 18, +// CHECK-NEXT: 
name: "arg17", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.000168 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 2048 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: buffer: 19, +// CHECK-NEXT: name: "arg18", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.000156 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 640 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: name: "arg19", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.096711 ], +// CHECK-NEXT: zero_point: [ 10 ] +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 2048 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: name: "arg20", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.000488 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 0 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: name: "input_to_input_intermediate", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.004989 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 0 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: name: "input_to_forget_intermediate", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.007885 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 0 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: name: "input_to_cell_intermediate", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.008763 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 0 ], +// CHECK-NEXT: type: INT16, +// CHECK-NEXT: name: "input_to_output_intermediate", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.005753 ], +// CHECK-NEXT: zero_point: [ 0 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 0 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: name: "effective_hidden_scale_intermediate", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.007563 ], +// CHECK-NEXT: zero_point: [ 2 ] +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 640 ], +// CHECK-NEXT: type: INT8, +// CHECK-NEXT: buffer: 22, +// CHECK-NEXT: name: "tfl.lstm", +// CHECK-NEXT: quantization: { +// CHECK-NEXT: scale: [ 0.096711 ], +// CHECK-NEXT: zero_point: [ 10 ] +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 ], +// CHECK-NEXT: outputs: [ 26 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12, 13, 14, 19, 20, 15, 16, 17, 18 ], +// CHECK-NEXT: outputs: [ 26 ], +// CHECK-NEXT: builtin_options_type: LSTMOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: fused_activation_function: TANH, +// CHECK-NEXT: cell_clip: 10.0, +// CHECK-NEXT: proj_clip: 0.01 +// CHECK-NEXT: }, +// CHECK-NEXT: intermediates: [ 21, 22, 23, 24, 25 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// 
CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 55, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 23 +// CHECK-NEXT: } ] +// CHECK-NEXT: } +} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir index d39a8353c6f..6c9dd515ca8 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir @@ -128,6 +128,12 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 49, 51, 46, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 8 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir index 47935358512..fc7ef307bae 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_pooling_with_arg_max_2d.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --check-prefix=MLIR %s func @main(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x32x32x32xf32>) { @@ -50,6 +50,12 @@ func @main(%arg0: tensor<1x64x64x32xf32>) -> (tensor<1x32x32x32xf32>, tensor<1x3 // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 4 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir index be2cc62e156..0dc6f7ea165 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/max_unpool_2d.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | 
flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck --check-prefix=MLIR %s func @main(%arg0: tensor<1x8x8x128xf32>, %arg1: tensor<1x8x8x128xf32>) -> tensor<1x8x8x128xf32> { @@ -50,6 +50,12 @@ func @main(%arg0: tensor<1x8x8x128xf32>, %arg1: tensor<1x8x8x128xf32>) -> tensor // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 4 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir index 560d849ece3..8d2f63a8f15 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir @@ -20,6 +20,8 @@ module attributes { // CHECK-NEXT: data: [ 118, 97, 108, 117, 101, 49 ] // CHECK-NEXT: }, { // CHECK-NEXT: data: [ 118, 97, 108, 117, 101, 50 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 54, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: } ], // CHECK-NEXT: metadata: [ { // CHECK-NEXT: name: "key1", @@ -27,4 +29,8 @@ module attributes { // CHECK-NEXT: }, { // CHECK-NEXT: name: "key2", // CHECK-NEXT: buffer: 5 +// CHECK-NEXT: }, { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] +// CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir index 2f77163e7a9..3879fc3f1aa 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir @@ -58,6 +58,12 @@ func @main(tensor<3x!quant.uniform>) -> tensor<3x!quant.uniform>) -> tensor<3x!quant.uniform) -> tensor<1x1x1x16xf32> { ^bb0(%arg0: tensor<1x6x6x16xf32>): @@ -47,6 +47,12 @@ func @main(tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: + // CHECK-NEXT: }, { + // CHECK-NEXT: data: [ 49, 46, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] + // CHECK-NEXT: } ], + // CHECK-NEXT: metadata: [ { + // CHECK-NEXT: name: "min_runtime_version", + // CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir index 8b2f6ea8b0e..f7830acabf7 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s // CHECK: { // CHECK-NEXT: version: 3, @@ -40,6 +40,12 @@ // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir index 7d9b113de65..c50857fa2ea 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { // CHECK: { @@ -153,6 +153,12 @@ func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 51, 46, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 10 // CHECK-NEXT: } ] // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir index c019cf12f05..6ef628229a4 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<3x2xi32>) -> tensor<6xi32> { ^bb0(%arg0: tensor<3x2xi32>): @@ -51,6 +51,12 @@ func @main(tensor<3x2xi32>) -> tensor<6xi32> { // CHECK-NEXT: data: [ 6, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 4 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir index ee731c383f3..148039a1b41 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir @@ -97,6 +97,12 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: data: [ 10, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 54, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir index db9668b8e30..559f3745149 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { // CHECK: { @@ -79,6 +79,12 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// 
CHECK-NEXT: data: [ 49, 46, 53, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 7 // CHECK-NEXT: } ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir index 8967822e234..ebfd1807280 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4 x f32>, tensor<4 x i8>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { // CHECK: { @@ -80,6 +80,12 @@ func @main(tensor<4 x f32>, tensor<4 x i8>, tensor<4 x f32>, tensor<4 x f32>) -> // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 7 // CHECK-NEXT: } ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tf_entry_function.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tf_entry_function.mlir new file mode 100644 index 00000000000..f1dc92678ed --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tf_entry_function.mlir @@ -0,0 +1,56 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s + +module { +func @serving_default(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> attributes {tf.entry_function = {inputs = "serving_default_x", outputs = "outputs"}} { +// CHECK: { + +// CHECK-LABEL: version: 3, + +// CHECK-LABEL: operator_codes: [ { +// CHECK: version: 1 +// CHECK: } ], + +// CHECK-LABEL: subgraphs: [ { +// CHECK: tensors: [ { +// CHECK: shape: [ 3, 2 ], +// CHECK: buffer: 1, +// CHECK: name: "serving_default_x", +// CHECK: quantization: { +// CHECK: } +// CHECK: }, { +// CHECK: shape: [ 3, 2 ], +// CHECK: buffer: 2, +// CHECK: name: "tfl.pseudo_const", +// CHECK: quantization: { +// CHECK: } +// CHECK: }, { +// CHECK: shape: [ 3, 2 ], +// CHECK: buffer: 3, +// CHECK: name: "outputs", +// CHECK: quantization: { +// CHECK: } +// CHECK: } ], +// CHECK: inputs: [ 0 ], +// CHECK: outputs: [ 2 ], +// CHECK: operators: [ { +// CHECK: inputs: [ 1, 0 ], +// CHECK: outputs: [ 2 ], +// CHECK: builtin_options_type: AddOptions, +// CHECK: builtin_options: { +// CHECK: } +// CHECK: } ], +// CHECK: name: "main" +// CHECK: } ], +// CHECK-LABEL: description: "MLIR Converted.", +// CHECK-LABEL: buffers: [ { +// CHECK: }, { +// CHECK: }, { +// CHECK: data: [ 0, 0, 128, 63, 0, 0, 0, 64, 0, 0, 64, 64, 0, 0, 128, 64, 0, 0, 160, 64, 0, 0, 192, 64 ] +// CHECK: }, { +// CHECK: } ] +// CHECK: } + %0 = "tfl.pseudo_const" () {value = dense<[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]> : tensor<3x2xf32>} : () -> tensor<3x2xf32> + %1 = "tfl.add" (%0, %arg0) {fused_activation_function = "NONE"} : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> + return %1 : tensor<3x2xf32> +} +} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir 
b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir index 3ed6e43479e..bb9278c0d87 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir @@ -189,6 +189,12 @@ // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 14 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir index 019c96cab6c..8e579421b0b 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { // CHECK: { @@ -249,6 +249,12 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 51, 46, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 26 // CHECK-NEXT: } ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir index 88e31b2cf78..7ba24bd5c51 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { // CHECK: { @@ -79,6 +79,12 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 7 // CHECK-NEXT: } ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir 
index 58f19b66370..b40c9fb2044 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir @@ -1,4 +1,4 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s // CHECK: { // CHECK-NEXT: version: 3, @@ -189,6 +189,12 @@ // CHECK-EMPTY: // CHECK-NEXT: }, { // CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-NEXT: data: [ 49, 46, 49, 52, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 14 // CHECK-NEXT: } ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 57e2340dd37..995f20c4a07 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -103,7 +103,7 @@ func @testAddN(tensor, tensor, tensor) -> tensor, tensor, tensor) -> tensor { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor): - // expected-error @+1 {{'tfl.add_n' op operand #0 must be tensor of 32-bit float or 32-bit integer or QI16 type or QUI16 type values}} + // expected-error @+1 {{'tfl.add_n' op operand #0 must be tensor of 32-bit float or 32-bit signless integer or QI16 type or QUI16 type values}} %0 = "tfl.add_n"(%arg0, %arg1, %arg2): (tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -244,7 +244,7 @@ func @testLogicalNot(tensor) -> tensor { func @testLogicalNotWrongOperandType(tensor) -> tensor { ^bb0(%arg0: tensor): - // expected-error @+1 {{'tfl.logical_not' op operand #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{'tfl.logical_not' op operand #0 must be tensor of 1-bit signless integer values}} %0 = "tfl.logical_not"(%arg0) : (tensor) -> tensor return %0 : tensor } @@ -380,7 +380,7 @@ func @testLogicalAnd(tensor, tensor) -> tensor { func @testLogicalAndWrongOperandType(tensor, tensor) -> tensor { ^bb0(%arg0: tensor, %arg1: tensor): - // expected-error @+1 {{'tfl.logical_and' op operand #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{'tfl.logical_and' op operand #0 must be tensor of 1-bit signless integer values}} %0 = "tfl.logical_and"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } @@ -399,7 +399,7 @@ func @testLogicalOr(tensor, tensor) -> tensor { func @testLogicalOrWrongOperandType(tensor, tensor) -> tensor { ^bb0(%arg0: tensor, %arg1: tensor): - // expected-error @+1 {{'tfl.logical_or' op operand #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{'tfl.logical_or' op operand #0 must be tensor of 1-bit signless integer values}} %0 = "tfl.logical_or"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } @@ -593,6 +593,28 @@ func @testUnidirectionalSequenceLstmWithInvalidNoneType(%arg0: tensor, return %0 : tensor } +// ----- +// CHECK-LABEL: testLstmIntermediates + + +func @testLstmIntermediates(%arg0: tensor<1x528x!quant.uniform>, %arg1: tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, %arg2: tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, %arg3: tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, %arg4: tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, %arg5: tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, %arg6: tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, 
%arg7: tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, %arg8: tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, %arg9: tensor<2048x!quant.uniform>, %arg10: tensor<2048x!quant.uniform>, %arg11: tensor<2048x!quant.uniform>, %arg12: tensor<2048x!quant.uniform>, %arg13: tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, %arg14: tensor<640x!quant.uniform>, %arg15: tensor<2048x!quant.uniform>, %arg16: tensor<2048x!quant.uniform>, %arg17: tensor<2048x!quant.uniform>, %arg18: tensor<2048x!quant.uniform>, %arg19: tensor<1x640x!quant.uniform>, %arg20: tensor<1x2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> { + %cst = constant unit + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %cst, %cst, %cst, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) ({}) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, kernel_type = "FULL", proj_clip = 0.01 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> + return %0 : tensor<1x640x!quant.uniform> +// CHECK: %[[RES0:.*]] = constant unit +// CHECK: %[[RES1:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %[[RES0]], %[[RES0]], %[[RES0]], %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg19, %arg20, %arg15, %arg16, %arg17, %arg18) ( { +// CHECK: }) {cell_clip = 1.000000e+01 : f32, effective_hidden_scale_intermediate = tensor<0x!quant.uniform:f32, 0.0075630000792443752:2>>, fused_activation_function = "TANH", input_to_cell_intermediate = tensor<0x!quant.uniform>, input_to_forget_intermediate = tensor<0x!quant.uniform>, input_to_input_intermediate = tensor<0x!quant.uniform>, input_to_output_intermediate = tensor<0x!quant.uniform>, kernel_type = "FULL", proj_clip = 0.00999999977 : f32} : (tensor<1x528x!quant.uniform>, tensor<2048x528x!quant.uniform:f32, 0.059801999479532242>>, tensor<2048x528x!quant.uniform:f32, 0.031925998628139496>>, tensor<2048x528x!quant.uniform:f32, 0.056272000074386597>>, tensor<2048x528x!quant.uniform:f32, 0.063763998448848724>>, tensor<2048x640x!quant.uniform:f32, 0.013358999975025654>>, tensor<2048x640x!quant.uniform:f32, 0.022830000147223473>>, tensor<2048x640x!quant.uniform:f32, 0.032276000827550888>>, tensor<2048x640x!quant.uniform:f32, 0.035427000373601913>>, none, none, none, 
tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<640x2048x!quant.uniform:f32, 0.021174000576138496>>, tensor<640x!quant.uniform>, tensor<1x640x!quant.uniform>, tensor<1x2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>, tensor<2048x!quant.uniform>) -> tensor<1x640x!quant.uniform> +} + +// ----- + +// CHECK-LABEL: testBidirectionalSequenceLstm +func @testBidirectionalSequenceLstm(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor, %arg24: tensor, %arg25: tensor, %arg26: tensor, %arg27: tensor, %arg28: tensor, %arg29: tensor, %arg30: tensor, %arg31: tensor, %arg32: tensor, %arg33: tensor, %arg34: tensor, %arg35: tensor, %arg36: tensor, %arg37: tensor, %arg38: tensor, %arg39: tensor, %arg40: tensor, %arg41: tensor, %arg42: tensor, %arg43: tensor, %arg44: tensor, %arg45: tensor, %arg46: tensor, %arg47: tensor) -> tensor { + // CHECK: "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) + %0:2 = "tfl.bidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg30, %arg31, %arg32, %arg33, %arg34, %arg35, %arg36, %arg37, %arg38, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg45, %arg46, %arg47) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", merge_outputs = true, time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) + return %0#0 : tensor +} + // ----- // CHECK-LABEL: testLstmQuantizedType @@ -692,7 +714,7 @@ func @testSelectMultiDim(%cond : tensor, %arg0 : tensor, %arg1 : // ----- func @testSelectWithUnsupportedType(%cond : tensor, %arg0 : tensor, %arg1 : tensor) -> tensor { - // expected-error @+1 {{op operand #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{op operand #0 must be tensor of 
1-bit signless integer values}} %0 = "tfl.select"(%cond, %arg0, %arg1): (tensor,tensor,tensor) -> tensor return %0 : tensor } @@ -1141,8 +1163,8 @@ func @testStridedSliceWithQUI8(%arg0: tensor<12x2x2x5x!quant.uniform, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!tf.quint8> { - %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!tf.uint8>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf.quint8> +func @testStridedSliceTFType(%arg0: tensor<12x2x2x5xui8>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!tf.quint8> { + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xui8>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf.quint8> return %0 : tensor<1x2x2x5x!tf.quint8> } @@ -1166,7 +1188,7 @@ func @testOneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, % // ----- func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi8> { - // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit integer or 64-bit integer or 1-bit integer values}} + // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer values}} %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi8> return %0 : tensor<*xi8> } @@ -1239,7 +1261,7 @@ func @transpose(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2xi32>) -> tensor<2x2xi3 // ----- func @transpose_perm_not_i32(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2xf32>) -> tensor<2x2xi32> { - // expected-error @+1 {{op operand #1 must be tensor of 32-bit integer values}} + // expected-error @+1 {{op operand #1 must be tensor of 32-bit signless integer values}} %0 = "tfl.transpose"(%arg0, %arg1) : (tensor<2x2xi32>, tensor<2xf32>) -> tensor<2x2xi32> return %0 : tensor<2x2xi32> } @@ -1322,7 +1344,7 @@ func @transpose_1d_perm(%arg0 : tensor<2x2xi32>, %arg1 : tensor<2x2xi32>) -> ten // ----- func @anyWithI64Axis(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { - // expected-error @+1 {{tfl.reduce_any' op operand #1 must be tensor of 32-bit integer values}} + // expected-error @+1 {{tfl.reduce_any' op operand #1 must be tensor of 32-bit signless integer values}} %0 = "tfl.reduce_any"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor return %0 : tensor } @@ -1352,7 +1374,7 @@ func @testSplitVWithQuantizedTypes(%arg0 : tensor<10x!quant.uniform // ----- func @whereWithI32Input(%arg0: tensor<3x5xi32>) -> tensor { - // expected-error @+1 {{'tfl.where' op operand #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{'tfl.where' op operand #0 must be tensor of 1-bit signless integer values}} %0 = "tfl.where"(%arg0) : (tensor<3x5xi32>) -> tensor return %0 : tensor } @@ -1559,7 +1581,7 @@ func @testSliceBeginOutOfRange(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>) - func @testSplitOpWithBadNumSplits(%arg0 : tensor<16xf32>) -> () { %split_dim = constant dense<0> : tensor - // expected-error @+1 {{'tfl.split' op attribute 'num_splits' failed to satisfy constraint: positive 32-bit integer attribute}} + // 
expected-error @+1 {{'tfl.split' op attribute 'num_splits' failed to satisfy constraint: 32-bit signless integer attribute whose value is positive}} "tfl.split"(%split_dim, %arg0) {num_splits = 0 : i32} : (tensor, tensor<16xf32>) -> () return } @@ -1682,7 +1704,7 @@ func @testSplitOpWithValidTensorTypeDynamic(%arg0 : tensor<16x?xf32>) -> (tensor func @testSplitVOpWithBadNumSplits(%arg0 : tensor<16xf32>) -> () { %size_splits = constant dense<[]> : tensor<0xi32> %split_dim = constant dense<0> : tensor - // expected-error @+1 {{'tfl.split_v' op attribute 'num_splits' failed to satisfy constraint: positive 32-bit integer attribute}} + // expected-error @+1 {{'tfl.split_v' op attribute 'num_splits' failed to satisfy constraint: 32-bit signless integer attribute whose value is positive}} "tfl.split_v"(%arg0, %size_splits, %split_dim) {num_splits = 0 : i32} : (tensor<16xf32>, tensor<0xi32>, tensor) -> () return } @@ -1702,7 +1724,7 @@ func @testSplitVOpWithMismatchedNumResults(%arg0 : tensor<16xf32>) -> (tensor<8x func @testSplitVOpWithBadSizeSplitsTensorType(%arg0: tensor<16x4x4xf32>) -> tensor<16x4x4xf32> { %size_splits = constant dense<[[8, 8], [2, 2]]> : tensor<2x2xi32> %split_dim = constant dense<0> : tensor - // expected-error @+1 {{'tfl.split_v' op operand #1 must be 1D tensor of 32-bit integer values}} + // expected-error @+1 {{'tfl.split_v' op operand #1 must be 1D tensor of 32-bit signless integer values}} %0 = "tfl.split_v"(%arg0, %size_splits, %split_dim) {num_splits = 1 : i32} : (tensor<16x4x4xf32>, tensor<2x2xi32>, tensor) -> tensor<16x4x4xf32> return %0 : tensor<16x4x4xf32> } @@ -1711,7 +1733,7 @@ func @testSplitVOpWithBadSizeSplitsTensorType(%arg0: tensor<16x4x4xf32>) -> tens func @testSplitVOpWithBadSizeSplitsUnrankedTensorType(%arg0: tensor<16x4x4xf32>, %size_splits: tensor<*xi32>) -> tensor<16x4x4xf32> { %split_dim = constant dense<0> : tensor - // expected-error @+1 {{'tfl.split_v' op operand #1 must be 1D tensor of 32-bit integer values}} + // expected-error @+1 {{'tfl.split_v' op operand #1 must be 1D tensor of 32-bit signless integer values}} %0 = "tfl.split_v"(%arg0, %size_splits, %split_dim) {num_splits = 1 : i32} : (tensor<16x4x4xf32>, tensor<*xi32>, tensor) -> tensor<16x4x4xf32> return %0 : tensor<16x4x4xf32> } @@ -1761,7 +1783,7 @@ func @testSplitVOpWithBadSizeSplitsSize(%arg0: tensor<16x4x4xf32>) -> tensor<15x func @testSplitVOpWithBadSplitDimTensorType(%arg0: tensor<16x4x4xf32>) -> tensor<16x4x4xf32> { %size_splits = constant dense<[16]> : tensor<1xi32> %split_dim = constant dense<0> : tensor<2x2xi32> - // expected-error @+1 {{'tfl.split_v' op operand #2 must be 0D tensor of 32-bit integer values}} + // expected-error @+1 {{'tfl.split_v' op operand #2 must be 0D tensor of 32-bit signless integer values}} %0 = "tfl.split_v"(%arg0, %size_splits, %split_dim) {num_splits = 1 : i32} : (tensor<16x4x4xf32>, tensor<1xi32>, tensor<2x2xi32>) -> tensor<16x4x4xf32> return %0 : tensor<16x4x4xf32> } @@ -1770,7 +1792,7 @@ func @testSplitVOpWithBadSplitDimTensorType(%arg0: tensor<16x4x4xf32>) -> tensor func @testSplitVOpWithBadSplitDimUnrankedTensorType(%arg0: tensor<16x4x4xf32>, %split_dim : tensor<*xi32>) -> tensor<16x4x4xf32> { %size_splits = constant dense<[16]> : tensor<1xi32> - // expected-error @+1 {{'tfl.split_v' op operand #2 must be 0D tensor of 32-bit integer values}} + // expected-error @+1 {{'tfl.split_v' op operand #2 must be 0D tensor of 32-bit signless integer values}} %0 = "tfl.split_v"(%arg0, %size_splits, %split_dim) {num_splits = 1 : i32} : (tensor<16x4x4xf32>, 
tensor<1xi32>, tensor<*xi32>) -> tensor<16x4x4xf32> return %0 : tensor<16x4x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index aaf2664ea3c..ae5bd6ced5e 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -3,6 +3,9 @@ // Run optimize pass and then canonicalize pass, and make sure some folding is applied. // RUN: tf-opt %s -tfl-optimize -canonicalize | FileCheck --check-prefix=FOLD %s +// Run legalize pass and then optimize pass, and make sure some fusing is applied. +// RUN: tf-opt %s -tfl-legalize-tf -tfl-optimize | FileCheck --check-prefix=Fusing --dump-input-on-failure %s + // CHECK-LABEL: fusedConv2dRelu func @fusedConv2dRelu(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16xf32>) -> tensor<256x30x30x16xf32> { %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 2 : i32, dilation_w_factor = 3 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 4 : i32, stride_w = 5 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> @@ -680,6 +683,18 @@ func @InvalidFuseTileWithBinaryOp(%arg0: tensor<2x3xf32>) -> tensor<2x6xf32> { // CHECK: %[[TILE:[0-9].*]] = "tfl.tile" } +// CHECK-LABEL: InvalidFuseTileAlreadyBroadcastAlongTileDim +func @InvalidFuseTileAlreadyBroadcastAlongTileDim(%arg0: tensor<1x1x1x1xf32>) -> tensor<1x6x8x1xf32> { + %cst_1 = constant dense<[1, 6, 8, 1]> : tensor<4xi32> + %cst_2 = constant dense<[1, 1, 1, 46]> : tensor<4xi32> + %cst_20 = constant dense<4.600000e+01> : tensor + %0 = "tfl.tile"(%arg0, %cst_1) : (tensor<1x1x1x1xf32>, tensor<4xi32>) -> tensor<1x6x8x1xf32> + %1 = "tfl.mul"(%0, %cst_20) {fused_activation_function = "NONE"} : (tensor<1x6x8x1xf32>, tensor) -> tensor<1x6x8x1xf32> + return %1 : tensor<1x6x8x1xf32> + + // CHECK: %[[TILE:[0-9].*]] = "tfl.tile" +} + // CHECK-LABEL: FuseHardswish func @FuseHardswish(%arg0: tensor<1x112x112x16xf32>) -> tensor<1x56x56x16xf32> { %cst_0 = constant dense<3.0> : tensor @@ -835,3 +850,51 @@ func @NotfuseAddIntoConv2d_MultipleUsers(%arg0: tensor<256x32x32x3xf32>, %arg1: // CHECK: tfl.add // CHECK-NEXT: tfl.add } + +func @FusingaddRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tf.Add"(%arg0, %0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %2 = "tf.Relu"(%1) : (tensor<1xf32>) -> tensor<1xf32> + %3 = "tf.Relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + %4 = "tf.Add"(%3, %2) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %5 = "tf.Relu6"(%4) : (tensor<1xf32>) -> tensor<1xf32> + %6 = "tfl.add"(%5, %3) {fused_activation_function = "NONE"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %7 = "tf.Relu6"(%6) : (tensor<1xf32>) -> tensor<1xf32> + return %7: tensor<1xf32> + +// Fusing-LABEL: FusingaddRelu +// Fusing: %[[add:[0-9].*]] = tfl.add %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<1xf32> +// Fusing: %[[add1:[0-9].*]] = tfl.add %arg0, %[[add]] {fused_activation_function = "RELU"} : tensor<1xf32> +// Fusing: %[[relu:[0-9].*]] = "tfl.relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> +// Fusing: %[[add2:[0-9].*]] = tfl.add %[[relu]], %[[add1]] {fused_activation_function = "RELU6"} : tensor<1xf32> +// Fusing: %[[add3:[0-9].*]] = tfl.add %[[add2]], %[[relu]] {fused_activation_function = "RELU6"} : tensor<1xf32> +// Fusing: return +} + +func @FusingbiasAdd(%arg0: 
tensor<1x10x10x32xf32>, %arg1: tensor<32xf32>) -> tensor<1x10x10x32xf32> { + %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> + %1 = "tf.BiasAdd"(%0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> + %2 = "tf.Relu6"(%1) : (tensor<1x10x10x32xf32>) -> tensor<1x10x10x32xf32> + return %2 : tensor<1x10x10x32xf32> + +// Fusing-LABEL: FusingbiasAdd +// Fusing: %[[add:[0-9].*]] = "tfl.add"(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +// Fusing: %[[add1:[0-9].*]] = "tfl.add"(%[[add]], %arg1) {fused_activation_function = "RELU6"} : (tensor<1x10x10x32xf32>, tensor<32xf32>) -> tensor<1x10x10x32xf32> +} + +func @FusingdivRelu(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { + %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %1 = "tf.Div"(%arg0, %0) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %2 = "tf.Relu"(%1) : (tensor<1xf32>) -> tensor<1xf32> + %3 = "tf.Relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + %4 = "tf.Div"(%3, %2) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %5 = "tf.Relu6"(%4) : (tensor<1xf32>) -> tensor<1xf32> + return %5: tensor<1xf32> + +// Fusing-LABEL: FusingdivRelu +// Fusing: %[[div:[0-9].*]] = tfl.div %arg0, %arg1 {fused_activation_function = "NONE"} : tensor<1xf32> +// Fusing: %[[div1:[0-9].*]] = tfl.div %arg0, %[[div]] {fused_activation_function = "RELU"} : tensor<1xf32> +// Fusing: %[[relu:[0-9].*]] = "tfl.relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> +// Fusing: %[[div2:[0-9].*]] = tfl.div %[[relu]], %[[div1]] {fused_activation_function = "RELU6"} : tensor<1xf32> +// Fusing: return +} diff --git a/tensorflow/compiler/mlir/lite/tests/optimize_functional_ops.mlir b/tensorflow/compiler/mlir/lite/tests/optimize_functional_ops.mlir index 846f0126f21..dfd0c870a22 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize_functional_ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize_functional_ops.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tfl-optimize-functional-ops -split-input-file | FileCheck %s +// RUN: tf-opt %s -tfl-optimize-functional-ops -split-input-file | FileCheck %s --dump-input-on-failure // CHECK-LABEL: main func @main(%arg0: tensor, %arg1: tensor) -> (tensor) { @@ -131,5 +131,80 @@ func @_functionalize_if_then_branch_00(%arg0: tensor<*xi1>, %arg1: tensor<*xf32> // CHECK: func @main // CHECK-NOT: tf.If // CHECK: return -// CHECK-NOT: func else_branch -// CHECK-NOT: func then_branch +// CHECK-NOT: func @_functionalize_if_else_branch_00 +// CHECK-NOT: func @_functionalize_if_then_branch_00 + +// ----- + +// Verify unused if with function with side-effects is not removed. 
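// A minimal sketch of the property the next test exercises (illustrative only, not part of the patch; @stateful_then and @stateful_else are made-up names):
// an unused "tf.If" marked is_stateless = false may still observe or mutate state through its branch functions, so -tfl-optimize-functional-ops has to leave
// both the op and its branch functions in place.
//
//   %unused = "tf.If"(%cond, %cond, %x, %y)
//       {Tcond = i1, else_branch = @stateful_else, then_branch = @stateful_then, is_stateless = false}
//     : (tensor<i1>, tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<i1>
//   // Kept: the op does not claim to be stateless, and a branch body may
//   // contain side-effecting ops (see the unknown op in the then-branch below).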
+ +func @main(%arg0: tensor<3x15x14x3xf32>) -> tensor<3x15x14x8xf32> + attributes {tf.entry_function = {inputs = "input", outputs = "Conv2D"}} { + %cst = constant dense<[0, 1, 2, 3]> : tensor<4xi32> + %cst_0 = constant dense<1.000000e+00> : tensor + %cst_1 = constant dense<0.000000e+00> : tensor<8xf32> + %cst_2 = constant dense<0.000000e+00> : tensor<8x3x3x3xf32> + %0 = "tfl.sub"(%arg0, %cst_0) {fused_activation_function = "NONE"} : (tensor<3x15x14x3xf32>, tensor) -> tensor<3x15x14x3xf32> + %1 = "tfl.greater_equal"(%arg0, %0) : (tensor<3x15x14x3xf32>, tensor<3x15x14x3xf32>) -> tensor<3x15x14x3xi1> + %2 = "tf.All"(%1, %cst) {Tidx = i32, device = "/device:CPU:0", keep_dims = false} : (tensor<3x15x14x3xi1>, tensor<4xi32>) -> tensor + %3 = "tf.If"(%2, %2, %arg0, %0) {Tcond = i1, + else_branch = @_functionalize_if_else_branch_01, is_stateless = false, + then_branch = @_functionalize_if_then_branch_01} : + (tensor, tensor, tensor<3x15x14x3xf32>, tensor<3x15x14x3xf32>) -> tensor + %4 = "tfl.conv_2d"(%arg0, %cst_2, %cst_1) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<3x15x14x3xf32>, tensor<8x3x3x3xf32>, tensor<8xf32>) -> tensor<3x15x14x8xf32> + return %4 : tensor<3x15x14x8xf32> +} + +func @_functionalize_if_else_branch_01(%arg0: tensor<*xi1>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor { + %cst = constant dense : tensor + return %cst : tensor +} + +func @_functionalize_if_then_branch_01(%arg0: tensor<*xi1>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor { + %0 = "my_unknown_op.blah"() : () -> tensor + return %0 : tensor +} + +// CHECK: func @main +// CHECK: tf.If +// CHECK: return +// CHECK: func @_functionalize_if_else_branch_01 +// CHECK: func @_functionalize_if_then_branch_01 + +// ----- + +// Verify unused if with function with side-effects is removed if op says +// stateless. 
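// Sketch of the stateless counterpart checked next (illustrative only, not part of the patch; @pure_then and @pure_else are made-up names): when the op
// carries is_stateless = true, the pass may treat the branches as free of observable side effects, so an unused "tf.If" can be erased together with its
// now-unreferenced branch functions, even though the then-branch body contains an unknown op.
//
//   %unused = "tf.If"(%cond, %cond, %x, %y)
//       {Tcond = i1, else_branch = @pure_else, then_branch = @pure_then, is_stateless = true}
//     : (tensor<i1>, tensor<i1>, tensor<f32>, tensor<f32>) -> tensor<i1>
//   // Removable: the result has no uses and the op admits to touching no state.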
+ +func @main(%arg0: tensor<3x15x14x3xf32>) -> tensor<3x15x14x8xf32> + attributes {tf.entry_function = {inputs = "input", outputs = "Conv2D"}} { + %cst = constant dense<[0, 1, 2, 3]> : tensor<4xi32> + %cst_0 = constant dense<1.000000e+00> : tensor + %cst_1 = constant dense<0.000000e+00> : tensor<8xf32> + %cst_2 = constant dense<0.000000e+00> : tensor<8x3x3x3xf32> + %0 = "tfl.sub"(%arg0, %cst_0) {fused_activation_function = "NONE"} : (tensor<3x15x14x3xf32>, tensor) -> tensor<3x15x14x3xf32> + %1 = "tfl.greater_equal"(%arg0, %0) : (tensor<3x15x14x3xf32>, tensor<3x15x14x3xf32>) -> tensor<3x15x14x3xi1> + %2 = "tf.All"(%1, %cst) {Tidx = i32, device = "/device:CPU:0", keep_dims = false} : (tensor<3x15x14x3xi1>, tensor<4xi32>) -> tensor + %3 = "tf.If"(%2, %2, %arg0, %0) {Tcond = i1, + else_branch = @_functionalize_if_else_branch_02, is_stateless = true, + then_branch = @_functionalize_if_then_branch_02} : + (tensor, tensor, tensor<3x15x14x3xf32>, tensor<3x15x14x3xf32>) -> tensor + %4 = "tfl.conv_2d"(%arg0, %cst_2, %cst_1) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<3x15x14x3xf32>, tensor<8x3x3x3xf32>, tensor<8xf32>) -> tensor<3x15x14x8xf32> + return %4 : tensor<3x15x14x8xf32> +} + +func @_functionalize_if_else_branch_02(%arg0: tensor<*xi1>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor { + %cst = constant dense : tensor + return %cst : tensor +} + +func @_functionalize_if_then_branch_02(%arg0: tensor<*xi1>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor { + %0 = "my_unknown_op.blah"() : () -> tensor + return %0 : tensor +} + +// CHECK: func @main +// CHECK-NOT: tf.If +// CHECK: return +// CHECK-NOT: func @_functionalize_if_else_branch_02 +// CHECK-NOT: func @_functionalize_if_then_branch_02 diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index c34cfdf441c..83ab0f9cd0e 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -42,10 +42,10 @@ func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: tensor<3x4xf3 // CHECK-SAME: [[VAL_0]]: tensor<1x?xf32>, [[VAL_1]]: tensor<3x4xf32>, [[VAL_3:%.*]]: tensor<2xf32>, [[VAL_4:%.*]]: tensor<1x3xf32>, [[VAL_5:%.*]]: tensor) -> tensor<1x?xf32> // CHECK-LABEL: attributes {tf._implements = "LSTMCellSimple", tf._reference = "mlir"} { -// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_1]], [[VAL_6]]) : (tensor<3x4xf32>, tensor<2xi64>) -> tensor<4x3xf32> -// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<1x3xf32>, tensor<2xi64>) -> tensor<3x1xf32> +// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_1]], [[VAL_6]]) : (tensor<3x4xf32>, tensor<2xi32>) -> tensor<4x3xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<1x3xf32>, tensor<2xi32>) -> tensor<3x1xf32> // CHECK: [[VAL_10:%.*]] = constant unit // CHECK: [[VAL_11:%.*]] = constant dense<0> : tensor<2xi64> // CHECK: [[VAL_12:%.*]] = constant dense<[1, 0]> : tensor<2xi64> @@ -94,10 +94,10 @@ func @layernormalizedlstmcellsimple(%arg0: tensor<1x?xf32>, %arg1: 
tensor<3x4xf3 // CHECK-SAME: [[VAL_0]]: tensor<1x?xf32>, [[VAL_1]]: tensor<3x4xf32>, [[VAL_3]]: tensor<2xf32>, [[VAL_4]]: tensor<1x3xf32>, [[VAL_5]]: tensor<2xf32>) -> tensor<1x?xf32> // CHECK-LABEL: attributes {tf._implements = "LayerNormalizedLstmCellSimple", tf._reference = "mlir"} { -// CHECK: [[VAL_52:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_53:%.*]] = "tf.Transpose"([[VAL_1]], [[VAL_52]]) : (tensor<3x4xf32>, tensor<2xi64>) -> tensor<4x3xf32> -// CHECK: [[VAL_54:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_55:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_54]]) : (tensor<1x3xf32>, tensor<2xi64>) -> tensor<3x1xf32> +// CHECK: [[VAL_52:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_53:%.*]] = "tf.Transpose"([[VAL_1]], [[VAL_52]]) : (tensor<3x4xf32>, tensor<2xi32>) -> tensor<4x3xf32> +// CHECK: [[VAL_54:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_55:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_54]]) : (tensor<1x3xf32>, tensor<2xi32>) -> tensor<3x1xf32> // CHECK: [[VAL_56:%.*]] = constant unit // CHECK: [[VAL_57:%.*]] = constant dense<0> : tensor<2xi64> // CHECK: [[VAL_58:%.*]] = constant dense<[1, 0]> : tensor<2xi64> @@ -165,11 +165,11 @@ func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: tensor return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor } -// CHECK: func @inference_standard_lstm_time_major([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { -// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> -// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi64>) -> tensor<40x10xf32> +// CHECK: func @inference_standard_lstm_time_major([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> // CHECK: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : 
tensor<4xi32>} : () -> tensor<4xi32> // CHECK: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) @@ -181,11 +181,14 @@ func @inference_standard_lstm_time_major(%arg0: tensor, %arg1: tensor // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit // CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor -// CHECK: [[VAL_21:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_22:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_23:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_24:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_21]], [[VAL_20]], [[VAL_22]], [[VAL_23]], [[VAL_24]] : tensor, tensor, tensor, tensor, tensor +// CHECK: [[VAL_21:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> +// CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> +// CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor // CHECK: } } @@ -203,32 +206,32 @@ func @inference_standard_lstm_non_time_major(%arg0: tensor<8x8x8xf32>, %arg1: te return %5, %4, %5, %5, %6 : tensor, tensor<8x8x10xf32>, tensor, tensor, tensor } -// CHECK: func @inference_standard_lstm_non_time_major([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", 
"tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = false} { -// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0, 2]> : tensor<3xi64> -// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_0]], [[VAL_6]]) : (tensor<8x8x8xf32>, tensor<3xi64>) -> tensor<8x8x8xf32> -// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_8]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> -// CHECK: [[VAL_10:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi64>) -> tensor<40x10xf32> -// CHECK: [[VAL_12:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: [[VAL_13:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_14:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_12]], [[VAL_13]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK: [[VAL_15:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: [[VAL_16:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_17:%.*]]:4 = "tf.SplitV"([[VAL_11]], [[VAL_15]], [[VAL_16]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK: [[VAL_18:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_21:%.*]] = constant unit -// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> -// CHECK: [[VAL_23:%.*]] = constant dense<[1, 0, 2]> : tensor<3xi64> -// CHECK: [[VAL_24:%.*]] = "tf.Transpose"([[VAL_22]], [[VAL_23]]) : (tensor<8x8x10xf32>, tensor<3xi64>) -> tensor<8x8x10xf32> +// CHECK: func @inference_standard_lstm_non_time_major([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", 
"tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = false} { +// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> +// CHECK: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) +// CHECK: [[VAL_13:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) +// CHECK: [[VAL_16:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) +// CHECK: [[VAL_19:%.*]] = constant unit +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_21:%.*]] = constant dense<[0, -1, 0]> : tensor<3xi32> +// CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> +// CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> // CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor // CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_25]], [[VAL_24]], [[VAL_26]], [[VAL_27]], [[VAL_28]] : tensor, 
tensor<8x8x10xf32>, tensor, tensor, tensor +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor // CHECK: } + } // ----- @@ -245,13 +248,13 @@ func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor } -// CHECK: func @inference_standard_lstm_time_major_go_backwards([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { +// CHECK: func @inference_standard_lstm_time_major_go_backwards([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = true} { // CHECK: [[VAL_6:%.*]] = constant dense<0> : tensor<1xi32> // CHECK: [[VAL_7:%.*]] = "tf.ReverseV2"([[VAL_0]], [[VAL_6]]) : (tensor, tensor<1xi32>) -> tensor -// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_8]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> -// CHECK: [[VAL_10:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi64>) -> tensor<40x10xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_8]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> +// CHECK: [[VAL_10:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> // CHECK: [[VAL_12:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> // CHECK: [[VAL_13:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_14:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_12]], [[VAL_13]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) @@ -262,12 +265,15 @@ func @inference_standard_lstm_time_major_go_backwards(%arg0: tensor, // CHECK: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_21:%.*]] = 
constant unit -// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor -// CHECK: [[VAL_23:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_24:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_23]], [[VAL_22]], [[VAL_24]], [[VAL_25]], [[VAL_26]] : tensor, tensor, tensor, tensor, tensor +// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor +// CHECK: [[VAL_23:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> +// CHECK: [[VAL_24:%.*]] = constant dense<0> : tensor<3xi32> +// CHECK: [[VAL_25:%.*]] = constant dense<1> : tensor<3xi32> +// CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor // CHECK: } } @@ -286,33 +292,32 @@ func @inference_standard_lstm_non_time_major_go_backwards(%arg0: tensor<8x8x8xf3 return %5, %4, %5, %5, %6 : tensor, tensor<8x8x10xf32>, tensor, tensor, tensor } -// CHECK: func @inference_standard_lstm_non_time_major_go_backwards([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor, tensor<8x8x10xf32>, tensor, tensor, tensor) 
attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { -// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0, 2]> : tensor<3xi64> -// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_0]], [[VAL_6]]) : (tensor<8x8x8xf32>, tensor<3xi64>) -> tensor<8x8x8xf32> -// CHECK: [[VAL_8:%.*]] = constant dense<0> : tensor<1xi32> -// CHECK: [[VAL_9:%.*]] = "tf.ReverseV2"([[VAL_7]], [[VAL_8]]) : (tensor<8x8x8xf32>, tensor<1xi32>) -> tensor<8x8x8xf32> -// CHECK: [[VAL_10:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_10]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> -// CHECK: [[VAL_12:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_13:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_12]]) : (tensor<10x40xf32>, tensor<2xi64>) -> tensor<40x10xf32> -// CHECK: [[VAL_14:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: [[VAL_15:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_16:%.*]]:4 = "tf.SplitV"([[VAL_11]], [[VAL_14]], [[VAL_15]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) -// CHECK: [[VAL_17:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: [[VAL_18:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_19:%.*]]:4 = "tf.SplitV"([[VAL_13]], [[VAL_17]], [[VAL_18]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) -// CHECK: [[VAL_20:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: [[VAL_21:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_22:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_20]], [[VAL_21]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) -// CHECK: [[VAL_23:%.*]] = constant unit -// CHECK: [[VAL_24:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_16]]#0, [[VAL_16]]#1, [[VAL_16]]#2, [[VAL_16]]#3, [[VAL_19]]#0, [[VAL_19]]#1, [[VAL_19]]#2, [[VAL_19]]#3, [[VAL_23]], [[VAL_23]], [[VAL_23]], [[VAL_22]]#0, [[VAL_22]]#1, [[VAL_22]]#2, [[VAL_22]]#3, [[VAL_23]], [[VAL_23]], [[VAL_1]], [[VAL_2]], [[VAL_23]], [[VAL_23]], [[VAL_23]], [[VAL_23]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> -// CHECK: [[VAL_25:%.*]] = constant dense<[1, 0, 2]> : tensor<3xi64> -// CHECK: [[VAL_26:%.*]] = "tf.Transpose"([[VAL_24]], [[VAL_25]]) : (tensor<8x8x10xf32>, tensor<3xi64>) -> tensor<8x8x10xf32> +// CHECK: func @inference_standard_lstm_non_time_major_go_backwards([[VAL_0:%.*]]: tensor<8x8x8xf32>, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, 
[[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = true, tf.time_major = false} { +// CHECK: [[VAL_6:%.*]] = constant dense<1> : tensor<1xi32> +// CHECK: [[VAL_7:%.*]] = "tf.ReverseV2"([[VAL_0]], [[VAL_6]]) : (tensor<8x8x8xf32>, tensor<1xi32>) -> tensor<8x8x8xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_8]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> +// CHECK: [[VAL_10:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_11:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_10]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> +// CHECK: [[VAL_12:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_13:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_14:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_12]], [[VAL_13]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) +// CHECK: [[VAL_15:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_16:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_17:%.*]]:4 = "tf.SplitV"([[VAL_11]], [[VAL_15]], [[VAL_16]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>) +// CHECK: [[VAL_18:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_19:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_20:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_18]], [[VAL_19]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) +// CHECK: [[VAL_21:%.*]] = constant unit +// CHECK: [[VAL_22:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_14]]#0, [[VAL_14]]#1, [[VAL_14]]#2, [[VAL_14]]#3, [[VAL_17]]#0, [[VAL_17]]#1, [[VAL_17]]#2, [[VAL_17]]#3, [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_20]]#0, [[VAL_20]]#1, [[VAL_20]]#2, [[VAL_20]]#3, [[VAL_21]], [[VAL_21]], [[VAL_1]], [[VAL_2]], [[VAL_21]], [[VAL_21]], [[VAL_21]], [[VAL_21]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} : (tensor<8x8x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor<8x8x10xf32> +// CHECK: [[VAL_23:%.*]] = constant dense<[0, -1, 0]> : tensor<3xi32> +// CHECK: [[VAL_24:%.*]] = constant dense<0> : tensor<3xi32> +// CHECK: [[VAL_25:%.*]] = constant dense<1> : tensor<3xi32> +// CHECK: [[VAL_26:%.*]] = "tf.StridedSlice"([[VAL_22]], [[VAL_23]], [[VAL_24]], [[VAL_25]]) {begin_mask = 5 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 2 : i64} : (tensor<8x8x10xf32>, 
tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> // CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor // CHECK: [[VAL_28:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_30:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_27]], [[VAL_26]], [[VAL_28]], [[VAL_29]], [[VAL_30]] : tensor, tensor<8x8x10xf32>, tensor, tensor, tensor +// CHECK: [[VAL_29:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: return [[VAL_26]], [[VAL_22]], [[VAL_27]], [[VAL_28]], [[VAL_29]] : tensor<8x10xf32>, tensor<8x8x10xf32>, tensor, tensor, tensor // CHECK: } } @@ -338,11 +343,11 @@ func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, %arg return %5, %4, %5, %5, %6 : tensor, tensor, tensor, tensor, tensor } -// CHECK: func @inference_standard_lstm_time_major_can_fuse([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { -// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi64>) -> tensor<40x8xf32> -// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi64> -// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi64>) -> tensor<40x10xf32> +// CHECK: func @inference_standard_lstm_time_major_can_fuse([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> // CHECK: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> // CHECK: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) @@ -354,11 
+359,65 @@ func @inference_standard_lstm_time_major_can_fuse(%arg0: tensor, %arg // CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) // CHECK: [[VAL_19:%.*]] = constant unit // CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor -// CHECK: [[VAL_21:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_22:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_23:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: [[VAL_24:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor -// CHECK: return [[VAL_21]], [[VAL_20]], [[VAL_22]], [[VAL_23]], [[VAL_24]] : tensor, tensor, tensor, tensor, tensor +// CHECK: [[VAL_21:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> +// CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> +// CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor +// CHECK: } + +} + +// ----- + +module { +func @inference_can_fuse_last_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) { + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = f32, value = dense<0.000000e+00> : tensor} : () -> tensor + %1:5 = "tf.PartitionedCall"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], _output_shapes = ["tfshape$dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 9 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$"], _read_only_resource_inputs = [], config = "", config_proto = "\0A\07\0A\03CPU\10\01\0A\07\0A\03GPU\10\002\02J\008\01", device = "", 
executor_type = "", f = @inference_standard_lstm_time_major_can_fuse_last_output} : (tensor, tensor, tensor, tensor<8x40xf32>, tensor<10x40xf32>, tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) + %2 = "tf.Add"(%0, %1#0) : (tensor, tensor<8x10xf32>) -> tensor<8x10xf32> + return +} + +func @inference_standard_lstm_time_major_can_fuse_last_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor<8x40xf32>, %arg4: tensor<10x40xf32>, %arg5: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { + %0 = "tf.BatchMatMulV2"(%arg0, %arg3) {adj_x = false, adj_y = false} : (tensor, tensor<8x40xf32>) -> tensor + %1 = "tf.Add"(%0, %arg5) : (tensor, tensor<40xf32>) -> tensor + %2 = "tf.BatchMatMulV2"(%1, %arg4) {adj_x = false, adj_y = true} : (tensor, tensor<10x40xf32>) -> tensor + %3 = "tf.Add"(%2, %arg1) : (tensor, tensor) -> tensor + %4 = "tf.Add"(%2, %arg2) : (tensor, tensor) -> tensor + %5 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor + %6 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "/device:CPU:0", dtype = f32, value = dense<1.000000e+00> : tensor} : () -> tensor + %7 = "tf.Add"(%arg1, %arg2) : (tensor, tensor) -> tensor<8x10xf32> + return %7, %4, %5, %5, %6 : tensor<8x10xf32>, tensor, tensor, tensor, tensor +} + +// CHECK: func @inference_standard_lstm_time_major_can_fuse_last_output([[VAL_0:%.*]]: tensor, [[VAL_1:%.*]]: tensor, [[VAL_2:%.*]]: tensor, [[VAL_3:%.*]]: tensor<8x40xf32>, [[VAL_4:%.*]]: tensor<10x40xf32>, [[VAL_5:%.*]]: tensor<40xf32>) -> (tensor<8x10xf32>, tensor, tensor, tensor, tensor) attributes {tf._input_shapes = ["tfshape$dim { size: -1 } dim { size: 8 } dim { size: 8 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$dim { size: -1 } dim { size: 10 }", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true", "tfshape$unknown_rank: true"], tf.api_implements = "lstm_b4e9f0e7-ac55-42bc-8ef2-8496419a608c", tf.api_preferred_device = "CPU", tf.go_backwards = false, tf.time_major = true} { +// CHECK: [[VAL_6:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_7:%.*]] = "tf.Transpose"([[VAL_3]], [[VAL_6]]) : (tensor<8x40xf32>, tensor<2xi32>) -> tensor<40x8xf32> +// CHECK: [[VAL_8:%.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: [[VAL_9:%.*]] = "tf.Transpose"([[VAL_4]], [[VAL_8]]) : (tensor<10x40xf32>, tensor<2xi32>) -> tensor<40x10xf32> +// CHECK: [[VAL_10:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_11:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_12:%.*]]:4 = "tf.SplitV"([[VAL_7]], [[VAL_10]], [[VAL_11]]) : (tensor<40x8xf32>, tensor<4xi32>, tensor) -> (tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>) +// CHECK: [[VAL_13:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_14:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_15:%.*]]:4 = "tf.SplitV"([[VAL_9]], [[VAL_13]], [[VAL_14]]) : (tensor<40x10xf32>, tensor<4xi32>, tensor) -> (tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, 
tensor<10x10xf32>) +// CHECK: [[VAL_16:%.*]] = "tf.Const"() {value = dense<10> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: [[VAL_17:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: [[VAL_18:%.*]]:4 = "tf.SplitV"([[VAL_5]], [[VAL_16]], [[VAL_17]]) : (tensor<40xf32>, tensor<4xi32>, tensor) -> (tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>) +// CHECK: [[VAL_19:%.*]] = constant unit +// CHECK: [[VAL_20:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_12]]#0, [[VAL_12]]#1, [[VAL_12]]#2, [[VAL_12]]#3, [[VAL_15]]#0, [[VAL_15]]#1, [[VAL_15]]#2, [[VAL_15]]#3, [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_18]]#0, [[VAL_18]]#1, [[VAL_18]]#2, [[VAL_18]]#3, [[VAL_19]], [[VAL_19]], [[VAL_1]], [[VAL_2]], [[VAL_19]], [[VAL_19]], [[VAL_19]], [[VAL_19]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x8xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, tensor<10x10xf32>, none, none, none, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, tensor<10xf32>, none, none, tensor, tensor, none, none, none, none) -> tensor +// CHECK: [[VAL_21:%.*]] = constant dense<[-1, 0, 0]> : tensor<3xi32> +// CHECK: [[VAL_22:%.*]] = constant dense<0> : tensor<3xi32> +// CHECK: [[VAL_23:%.*]] = constant dense<1> : tensor<3xi32> +// CHECK: [[VAL_24:%.*]] = "tf.StridedSlice"([[VAL_20]], [[VAL_21]], [[VAL_22]], [[VAL_23]]) {begin_mask = 6 : i64, ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<8x10xf32> +// CHECK: [[VAL_25:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_26:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: [[VAL_27:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1xf32>} : () -> tensor +// CHECK: return [[VAL_24]], [[VAL_20]], [[VAL_25]], [[VAL_26]], [[VAL_27]] : tensor<8x10xf32>, tensor, tensor, tensor, tensor +// CHECK: } } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 1aa1311318a..5e456b1a7e5 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -377,6 +377,32 @@ func @perChannelFakeQuantWithDepthwiseConv2D(tensor<256x32x32x3xf32>) -> (tensor // CHECK: return %[[CONV]] } +// CHECK-LABEL: perChannelFakeQuantWithDepthwiseConv2DWithReshape +func @perChannelFakeQuantWithDepthwiseConv2DWithReshape(%arg: tensor<1x160x160x48xf32>) -> (tensor<1x160x160x48xf32>) { + %in = constant dense<0.0> : tensor<3x3x48x1xf32> + %min = constant dense<0.0> : tensor<48xf32> + %max = constant dense<255.0> : tensor<48xf32> + %mini = "tf.Identity"(%min) : (tensor<48xf32>) -> tensor<48xf32> + %maxi = "tf.Identity"(%max) : (tensor<48xf32>) -> tensor<48xf32> + %s1 = constant dense<[3, 3, 48]> : tensor<3xi32> + %s2 = constant dense<[3, 3, 48, 1]> : tensor<4xi32> + %r1 = "tf.Reshape"(%in, %s1) {T = f32, Tshape = i32, device = ""} : (tensor<3x3x48x1xf32>, tensor<3xi32>) -> tensor<3x3x48xf32> + %fq = "tf.FakeQuantWithMinMaxVarsPerChannel"(%r1, %mini, %maxi) {num_bits = 3, narrow_range = false} : (tensor<3x3x48xf32>, tensor<48xf32>, tensor<48xf32>) -> tensor<3x3x48xf32> + %r2 = "tf.Reshape"(%fq, %s2) {T = f32, Tshape = i32, device = ""} : (tensor<3x3x48xf32>, tensor<4xi32>) 
-> tensor<3x3x48x1xf32> + %rst = "tf.DepthwiseConv2dNative"(%arg, %r2) {T = f32, data_format = "NHWC", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x160x160x48xf32>, tensor<3x3x48x1xf32>) -> tensor<1x160x160x48xf32> + return %rst : tensor<1x160x160x48xf32> + +// CHECK: %[[CONSTANT:.*]] = constant dense<0.000000e+00> : tensor<48xf32> +// CHECK: %[[CONSTANT0:.*]] = constant dense<0.000000e+00> : tensor<1x3x3x48xf32> +// CHECK: %[[QUANTIZE:.*]] = "tfl.quantize"(%[[CONSTANT0]]) {qtype = tensor<1x3x3x48x!quant.uniform>} +// CHECK: %[[DEQUANTIZE:.*]] = "tfl.dequantize"(%[[QUANTIZE]]) +// CHECK: %[[CONV:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[DEQUANTIZE]], %[[CONSTANT]]) +// CHECK: return %[[CONV]] +} + func @identity(%arg0: tensor<10xi32>, %arg1: tensor<20xi32>, %arg2: tensor<30xi32>) -> (tensor<10xi32>, tensor<20xi32>, tensor<30xi32>) { %0 = "tf.Identity"(%arg0) : (tensor<10xi32>) -> tensor<10xi32> %1:2 = "tf.IdentityN"(%arg1,%arg2) : (tensor<20xi32>, tensor<30xi32>) -> (tensor<20xi32>, tensor<30xi32>) diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index b000de17020..a80a1612488 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -84,11 +84,6 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, pass_manager->addPass(mlir::TFL::CreateLowerStaticTensorListPass()); } - // Enable fusing composite ops that can be lowered to built-in TFLite ops. - if (pass_config.emit_builtin_tflite_ops) { - pass_manager->addPass(mlir::TFL::CreatePrepareCompositeFunctionsPass()); - } - // The ophint extractions happen before lots of other passes: // The assumption of ophint-extraction is each ophinted region is a black-box // and nodes within this black-box is NOT connected to the nodes OUTSIDE the @@ -104,6 +99,27 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, pass_manager->addPass(mlir::TFL::CreateLegalizeOphintFuncOpPass()); } + // This decomposes resource ops like ResourceGather into read-variable op + // followed by gather. This is used when the saved model import path is used, + // during which resources don't get frozen in the Python layer. + pass_manager->addNestedPass( + mlir::TFDevice::CreateDecomposeResourceOpsPass()); + + // This pass does resource analysis of saved model global tensors and marks + // those deemed read-only as immutable. + pass_manager->addPass( + mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); + // This pass marks non-exported functions as having symbol visibility + // 'private'. + pass_manager->addPass( + mlir::tf_saved_model:: + CreateMarkFunctionVisibilityUsingSavedModelLinkagePass()); + + // Enable fusing composite ops that can be lowered to built-in TFLite ops. + if (pass_config.emit_builtin_tflite_ops) { + pass_manager->addPass(mlir::TFL::CreatePrepareCompositeFunctionsPass()); + } + // Legalize while early to allow further constant folding. // TODO(jpienaar): This may not actually matter as we do canonicalization // after the legalize below, for now it needs to be below the above passes @@ -114,6 +130,10 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, mlir::TFL::CreateLegalizeTFWhilePass()); } + // Add function inlining pass. Both TF and TFLite dialects are opted into + // the function inliner interface. + pass_manager->addPass(mlir::createInlinerPass()); + // TODO(jpienaar): Revise post dialect constants.
pass_manager->addPass(mlir::TF::CreateDecodeConstantPass()); // Canonicalization includes const folding, which is utilized here to optimize @@ -121,9 +141,15 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, // tf.Conv2D is split into tf.Transpose and tfl.Conv2D. pass_manager->addNestedPass(mlir::createCanonicalizerPass()); pass_manager->addNestedPass(mlir::createCSEPass()); + // This pass does dead code elimination based on symbol visibility. + pass_manager->addPass(mlir::createSymbolDCEPass()); + // This pass 'freezes' immutable global tensors and inlines them as tf + // constant ops. + pass_manager->addPass(mlir::tf_saved_model::CreateFreezeGlobalTensorsPass()); - if (pass_config.inline_functions) { - pass_manager->addPass(mlir::createInlinerPass()); + if (pass_config.shape_inference) { + // Add a shape inference pass to optimize away the unnecessary casts. + pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); } // The below passes only make sense if Builtin TFLite ops are enabled @@ -160,3 +186,85 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, } } // namespace tensorflow + +namespace mlir { +namespace TFL { + +struct StandardPipelineOptions + : public PassPipelineOptions { + // TODO(b/150915052): All the tf_tfl_translate_cl flags should + // move inside this. +}; + +// NOLINTNEXTLINE +// This creates the standard pass pipeline for TF->TFLite. This +// represents a std configuration for TFLite, for use with APIs like +// tensorflow/python/pywrap_mlir.py::experimental_run_pass_pipeline +// This does not yet include quantization passes. +void CreateTFLStandardPipeline(OpPassManager& pm, + const StandardPipelineOptions& options) { + OpPassManager& func_pm = pm.nest(); + + // tf_executor dialect passes - Cleaning up the IR. + func_pm.addPass(tf_executor::CreateSwitchFoldPass()); + func_pm.addPass(tf_executor::CreateTFExecutorGraphPruningPass()); + func_pm.addPass(tf_executor::CreateTFExecutorIslandCoarseningPass()); + + // more cleanup of executor dialect and raise to control flow. + pm.addPass(mlir::CreateTFExecutorToControlDialectConversion()); + pm.addPass(mlir::TFControlFlow::CreateRaiseTFControlFlowPass()); + + // This is needed for control flow support with TF TensorList. + pm.addPass(mlir::TFL::CreateLowerStaticTensorListPass()); + + // Saved model pass to mark global tensors immutable. + pm.addPass(mlir::tf_saved_model::CreateOptimizeGlobalTensorsPass()); + // Used to mark non-exported functions in saved model private. + pm.addPass(mlir::tf_saved_model:: + CreateMarkFunctionVisibilityUsingSavedModelLinkagePass()); + // Op fusion pass. + pm.addPass(mlir::TFL::CreatePrepareCompositeFunctionsPass()); + + pm.addNestedPass(mlir::TFL::CreateLegalizeTFWhilePass()); + + pm.addPass(mlir::createInlinerPass()); + + // Canonicalize, CSE etc. + pm.addPass(mlir::TF::CreateDecodeConstantPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addNestedPass(mlir::createCSEPass()); + // DCE for private symbols. + pm.addPass(mlir::createSymbolDCEPass()); + + // freeze global tensors. + pm.addPass(mlir::tf_saved_model::CreateFreezeGlobalTensorsPass()); + + // TFLite dialect passes. + pm.addPass(mlir::TFL::CreatePrepareTFPass(true)); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::TFL::CreateLegalizeTFPass()); + pm.addPass(mlir::TFL::CreateOptimizePass()); + pm.addPass(mlir::TFL::CreateOptimizeFunctionalOpsPass()); + + // Canonicalize, CSE etc. 
+ pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addNestedPass(mlir::createCSEPass()); + + // Pass for stateful operands like LSTM. + pm.addPass(mlir::TFL::CreateSplitMergedOperandsPass()); + + pm.addPass(mlir::TFL::CreateWhileOutlinePass()); + + pm.addPass(mlir::TFL::CreateRuntimeTypeVerifyPass()); +} + +// Registers a pass pipeline for the standard TFL passes. +static mlir::PassPipelineRegistration pipeline( + "tfl-standard-pipeline", + "Run the standard passes involved in transforming/optimizing the TF " + "program to TFLite after " + "importing into MLIR.", + CreateTFLStandardPipeline); + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 7f8ce4cf3d4..74e48cd6d91 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -137,13 +137,14 @@ int main(int argc, char **argv) { // TODO(b/147435528): We need to test the e2e behavior once the graph freezing // inside mlir is done. - if (import_saved_model || import_saved_model_v1) { + if (import_saved_model_object_graph || import_saved_model_signature_defs) { if (input_mlir) module = tensorflow::errors::InvalidArgument( "Importing saved model should not have input_mlir set"); - module = tensorflow::ImportSavedModel( - import_saved_model, import_saved_model_v1, input_file_name, - saved_model_tags, saved_model_exported_names, &context); + module = tensorflow::ImportSavedModel(import_saved_model_object_graph, + import_saved_model_signature_defs, + input_file_name, saved_model_tags, + saved_model_exported_names, &context); } else { module = tensorflow::LoadFromGraphdefOrMlirSource( input_file_name, input_mlir, use_splatted_constant, custom_opdefs, @@ -194,9 +195,18 @@ int main(int argc, char **argv) { mlir::TFL::PassConfig pass_config(quant_specs); pass_config.emit_builtin_tflite_ops = emit_builtin_tflite_ops; pass_config.lower_tensor_list_ops = lower_tensor_list_ops; - pass_config.inline_functions = inline_functions; + + // Currently we only do shape inference for saved model import. + if (import_saved_model_object_graph || import_saved_model_signature_defs) { + pass_config.shape_inference = true; + } tensorflow::AddTFToTFLConversionPasses(pass_config, &pm); + // TODO(b/150901738): Move those into tf_tfl_translate.cc. + // Convert back to outlined while format for export back to flatbuffer. 
+ if (pass_config.legalize_tf_while) { + pm.addPass(mlir::TFL::CreateWhileOutlinePass()); + } pm.addPass(mlir::TFL::CreateRuntimeTypeVerifyPass()); std::string result; diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc index de569a3496c..e4687c515ac 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc @@ -24,14 +24,14 @@ opt input_file_name(llvm::cl::Positional, llvm::cl::init("-")); // NOLINTNEXTLINE -opt import_saved_model( - "savedmodel-to-mlir", +opt import_saved_model_object_graph( + "savedmodel-objectgraph-to-mlir", llvm::cl::desc("Import a saved model to its MLIR representation"), llvm::cl::value_desc("dir")); // NOLINTNEXTLINE -opt import_saved_model_v1( - "savedmodel-v1-to-mlir", +opt import_saved_model_signature_defs( + "savedmodel-signaturedefs-to-mlir", llvm::cl::desc("Import a saved model V1 to its MLIR representation"), llvm::cl::value_desc("dir")); @@ -104,13 +104,6 @@ opt quant_stats_file_name("quant-stats", llvm::cl::value_desc("filename"), llvm::cl::init("")); -// NOLINTNEXTLINE -opt inline_functions( - "inline", - llvm::cl::desc("Inline function calls within the main function " - "before legalization to TFLite."), - llvm::cl::init(true)); - // NOLINTNEXTLINE opt legalize_while( "legalize-tf-while", diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h index d7e54d70b81..b42160a4a2a 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h @@ -35,14 +35,13 @@ extern llvm::cl::opt output_file_name; extern llvm::cl::opt use_splatted_constant; extern llvm::cl::opt input_mlir; extern llvm::cl::opt output_mlir; -extern llvm::cl::opt inline_functions; extern llvm::cl::list custom_opdefs; extern llvm::cl::opt emit_quant_adaptor_ops; extern llvm::cl::opt quant_stats_file_name; // Import saved model. 
-extern llvm::cl::opt import_saved_model; -extern llvm::cl::opt import_saved_model_v1; +extern llvm::cl::opt import_saved_model_object_graph; +extern llvm::cl::opt import_saved_model_signature_defs; extern llvm::cl::opt saved_model_tags; extern llvm::cl::opt saved_model_exported_names; #endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index f5097e1c01b..b05dcaadab2 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -169,7 +169,7 @@ StatusOr ImportSavedModel( std::vector exported_names = absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); - auto module = tensorflow::SavedModelToMlirImport( + auto module = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, absl::Span(exported_names), context); if (!module) return tensorflow::errors::InvalidArgument("fail to open input file"); @@ -179,8 +179,8 @@ StatusOr ImportSavedModel( std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); - auto module = - tensorflow::SavedModelV1ToMlirImport(input_filename, tags, context); + auto module = tensorflow::SavedModelSignatureDefsToMlirImport( + input_filename, tags, context); if (!module) return tensorflow::errors::InvalidArgument("fail to open input file"); diff --git a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc index 3582046f13f..5893d4f3779 100644 --- a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc +++ b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc @@ -426,7 +426,9 @@ void PreprocessTopoSortGraph( } bool IsSideEffectOp(Operation* op) { - if (op->hasNoSideEffect()) return false; + // TODO(riverriddle) Properly handle region side effects. + if (MemoryEffectOpInterface::hasNoEffect(op) && op->getNumRegions() == 0) + return false; // Identity op has no side effect. // Check the OperationName maybe more elegant here. diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 683905d06c7..586ddf6211f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -206,32 +206,12 @@ def : Pat<(TF_LogicalAndOp $l, $r), (TFL_LogicalAndOp $l, $r)>; def : Pat<(TF_LogicalOrOp $l, $r), (TFL_LogicalOrOp $l, $r)>; -// Multi-pattern consisting of matching stand-alone op or op followed by relu. -// TODO(karimnosseir): Can the activation part here be removed by modifying the -// very similar pass in optimize_patterns.td? -multiclass FusedBinaryActivationFuncOpPat { - def : Pat<(FromOp AnyTensor:$l, AnyTensor:$r), - (ToOp $l, $r, TFL_AF_None)>; - foreach actFnPair = [[TF_ReluOp, TFL_AF_Relu], - [TF_Relu6Op, TFL_AF_Relu6]] in { - def : Pat<(actFnPair[0] (FromOp:$bin_out $lhs, $rhs)), - (ToOp $lhs, $rhs, actFnPair[1]), - [(HasOneUse $bin_out)]>; - // TODO: Maybe move these below to general pass? - def : Pat<(actFnPair[0] (ToOp:$bin_out $lhs, $rhs, TFL_AF_None)), - (ToOp $lhs, $rhs, actFnPair[1]), - [(HasOneUse $bin_out)]>; - } -} - -// Instantiated FusedBinary patterns for the from-to pairs of ops. 
-foreach fromToPair = [[TF_AddOp, TFL_AddOp], - [TF_AddV2Op, TFL_AddOp], - [TF_DivOp, TFL_DivOp], - [TF_MulOp, TFL_MulOp], - [TF_RealDivOp, TFL_DivOp], - [TF_SubOp, TFL_SubOp]] in - defm : FusedBinaryActivationFuncOpPat; +def : Pat<(TF_AddOp $lhs, $rhs), (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; +def : Pat<(TF_AddV2Op $lhs, $rhs), (TFL_AddOp $lhs, $rhs, TFL_AF_None)>; +def : Pat<(TF_SubOp $lhs, $rhs), (TFL_SubOp $lhs, $rhs, TFL_AF_None)>; +def : Pat<(TF_MulOp $lhs, $rhs), (TFL_MulOp $lhs, $rhs, TFL_AF_None)>; +def : Pat<(TF_RealDivOp $lhs, $rhs), (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; +def : Pat<(TF_DivOp $lhs, $rhs), (TFL_DivOp $lhs, $rhs, TFL_AF_None)>; def : Pat<(TF_BiasAddOp F32Tensor:$l, F32Tensor:$r, IsDataFormatNHWC:$data_format), diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index cf24ed7e0f4..d2001db8b40 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -38,6 +38,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Support/Functional.h" // TF:llvm-project #include "mlir/Support/LLVM.h" // TF:llvm-project +#include "mlir/Transforms/DialectConversion.h" // TF:llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -122,6 +123,7 @@ DECL_CONVERT_OP(StridedSlice); DECL_CONVERT_OP(Unpack); DECL_CONVERT_OP(Reciprocal); DECL_CONVERT_OP(RandomUniform); +DECL_CONVERT_OP(BroadcastTo); #undef DECL_CONVERT_OP @@ -464,8 +466,7 @@ PatternMatchResult ConvertTFMatrixDiagV3Op::matchAndRewrite( // TF Lite doesn't support Assert, we just drop the assert from the graph. 
PatternMatchResult ConvertTFAssertOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { - op->dropAllReferences(); - op->erase(); + rewriter.eraseOp(op); return matchSuccess(); } @@ -474,8 +475,7 @@ StatusOr CreateConstOpWithSingleValue(PatternRewriter* rewriter, ShapedType shaped_type, int value) { Type element_type = shaped_type.getElementType(); - ShapedType ranked_tensor_type = RankedTensorType::get({1}, element_type); - Type type = ranked_tensor_type; + ShapedType scalar_type = RankedTensorType::get({}, element_type); Attribute attr; switch (element_type.getKind()) { case mlir::StandardTypes::F16: { @@ -483,12 +483,12 @@ StatusOr CreateConstOpWithSingleValue(PatternRewriter* rewriter, auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); std::vector floatValues({floatAttr}); - attr = DenseElementsAttr::get(ranked_tensor_type, floatValues); + attr = DenseElementsAttr::get(scalar_type, floatValues); break; } case mlir::StandardTypes::F32: { - attr = DenseElementsAttr::get(ranked_tensor_type, - static_cast(value)); + attr = + DenseElementsAttr::get(scalar_type, static_cast(value)); break; } case mlir::StandardTypes::Complex: { @@ -509,8 +509,7 @@ StatusOr CreateConstOpWithSingleValue(PatternRewriter* rewriter, repr.set_tensor_content(content); std::string mangled = tensorflow::mangling_util::MangleTensor(repr); - attr = - mlir::OpaqueElementsAttr::get(dialect, ranked_tensor_type, mangled); + attr = mlir::OpaqueElementsAttr::get(dialect, scalar_type, mangled); break; } return Status(tensorflow::error::INVALID_ARGUMENT, "Unsupported type"); @@ -519,19 +518,19 @@ StatusOr CreateConstOpWithSingleValue(PatternRewriter* rewriter, const auto& itype = element_type.cast(); switch (itype.getWidth()) { case 8: - attr = DenseElementsAttr::get(ranked_tensor_type, + attr = DenseElementsAttr::get(scalar_type, static_cast(value)); break; case 16: - attr = DenseElementsAttr::get(ranked_tensor_type, + attr = DenseElementsAttr::get(scalar_type, static_cast(value)); break; case 32: - attr = DenseElementsAttr::get(ranked_tensor_type, + attr = DenseElementsAttr::get(scalar_type, static_cast(value)); break; case 64: - attr = DenseElementsAttr::get(ranked_tensor_type, + attr = DenseElementsAttr::get(scalar_type, static_cast(value)); break; default: @@ -543,7 +542,7 @@ StatusOr CreateConstOpWithSingleValue(PatternRewriter* rewriter, default: return Status(tensorflow::error::INVALID_ARGUMENT, "Unsupported type"); } - return rewriter->create(loc, type, attr); + return rewriter->create(loc, scalar_type, attr); } PatternMatchResult ConvertTFReciprocalOp::matchAndRewrite( @@ -566,6 +565,31 @@ PatternMatchResult ConvertTFReciprocalOp::matchAndRewrite( return matchSuccess(); } +PatternMatchResult ConvertTFBroadcastToOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto tf_broadcast_to_op = cast(op); + auto element_type = tf_broadcast_to_op.input().getType().cast(); + auto output_type = tf_broadcast_to_op.output().getType(); + + auto status_or_const_op = + CreateConstOpWithSingleValue(&rewriter, op->getLoc(), element_type, 1); + if (!status_or_const_op.ok()) { + return matchFailure(); + } + + auto tfl_fill_op = rewriter.create( + op->getLoc(), output_type, tf_broadcast_to_op.shape(), + status_or_const_op.ValueOrDie()); + + StringAttr fused_activation_function = + StringAttr::get("NONE", rewriter.getContext()); + + rewriter.replaceOpWithNewOp( + op, output_type, tf_broadcast_to_op.input(), tfl_fill_op, + fused_activation_function); + return matchSuccess(); +} + 
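The ConvertTFBroadcastToOp pattern above lowers tf.BroadcastTo by filling a tensor of ones with the requested output shape and multiplying the original input against it, letting the multiply's broadcasting rules carry the broadcast. A minimal sketch of the rewrite in MLIR follows; the fill and multiply op classes are assumptions (the diff's template arguments were stripped, but the shape/value operands and the fused_activation_function attribute point at the TFL fill and mul ops), and the shapes are purely illustrative, not taken from the patch:

  // Before (hypothetical shapes):
  %y = "tf.BroadcastTo"(%x, %shape) : (tensor<3xf32>, tensor<2xi32>) -> tensor<2x3xf32>

  // After: a scalar one, a fill to the requested shape, and a broadcasting
  // multiply with no fused activation.
  %one  = constant dense<1.000000e+00> : tensor<f32>
  %ones = "tfl.fill"(%shape, %one) : (tensor<2xi32>, tensor<f32>) -> tensor<2x3xf32>
  %y2   = "tfl.mul"(%x, %ones) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>

Multiplying by a ones tensor reuses the element-wise broadcasting already supported by the TFLite dialect instead of requiring a dedicated broadcast kernel.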
// Legalize unidirectional sequence lstm. struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { explicit LegalizeUnidirectionalSequenceLstm(MLIRContext* context) @@ -616,7 +640,7 @@ struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { rewriter.getStringAttr("TANH"))); // cell_clip. attributes.push_back( - rewriter.getNamedAttr("cell_clip", rewriter.getF32FloatAttr(10.0))); + rewriter.getNamedAttr("cell_clip", rewriter.getF32FloatAttr(0.0))); // proj_clip. attributes.push_back( rewriter.getNamedAttr("proj_clip", rewriter.getF32FloatAttr(0.0))); @@ -629,7 +653,7 @@ struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { // Rewire the output. op->getResult(2).replaceAllUsesWith(lstm_op.getResult()); - op->erase(); + rewriter.eraseOp(op); return matchSuccess(); } }; @@ -688,7 +712,7 @@ struct LegalizeUnidirectionalSequenceRnn : public RewritePattern { // Rewire the output. op->getResult(1).replaceAllUsesWith(rnn_op.getResult()); - op->erase(); + rewriter.eraseOp(op); return matchSuccess(); } @@ -696,22 +720,44 @@ struct LegalizeUnidirectionalSequenceRnn : public RewritePattern { void LegalizeTF::runOnFunction() { OwningRewritePatternList patterns; - auto* ctx = &getContext(); + auto* context = &getContext(); auto func = getFunction(); // Add the generated patterns to the list. - populateWithGenerated(ctx, &patterns); - patterns - .insert(ctx); + populateWithGenerated(context, &patterns); + patterns.insert(context); // Ophint python converter converted tf node pattern. patterns.insert(ctx); - applyPatternsGreedily(func, patterns); + LegalizeUnidirectionalSequenceRnn>(context); + + ConversionTarget target(*context); + // It is legal to have TF ops in the graph still which can be + // used later or in the case of SELECT where we allow TF ops in the final + // graph. + target.addLegalOp(); + target.addLegalOp(); + target.addDynamicallyLegalDialect( + Optional([](Operation* op) { + auto tfl_op = dyn_cast_or_null(op); + if (!tfl_op) return false; + return succeeded(tfl_op.VerifyTflRuntimeTypes(tfl_op.getOperation())); + })); + // Keep trying to convert. + // TODO(karimnosseir): This is similar to what applying greedy patterns does. + // Look if there is a function that tries until it converges. + // Currently unit-test doesn't do multiple tries, so we need this.
+ const int max_iterations = 15; + for (int i = 0; i < max_iterations; ++i) { + if (failed(applyPartialConversion(func, target, patterns))) { + return; + } + } } } // namespace diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 754333b175f..a13490ddb9f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -864,13 +864,12 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( target.addLegalOp(); OwningRewritePatternList patterns; - patterns - .insert( - context); + populateWithGenerated(context, &patterns); + patterns.insert(context); return applyFullConversion(func, target, patterns); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index dbc12a85b67..bc39c0cf74b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -83,6 +83,17 @@ bool IsBroadcastableElementsAttrAndType(Type a, Type b) { return OpTrait::util::getBroadcastedType(a, b) != Type(); } +// Returns whether the resultant type of any broadcastable operation with +// operands `a` and `b` matches `expected_output`. Returns false if `a` is not +// broadcast-compatible with `b`. +bool OperandsBroadcastToOutputType(Type a, Type b, Type expected_output) { + Type output_element_type = + expected_output.cast().getElementType(); + Type broadcasted_type = + OpTrait::util::getBroadcastedType(a, b, output_element_type); + return broadcasted_type != Type() && broadcasted_type == expected_output; +} + // Returns whether if `type1` dimensions are the same as the ending dimensions // of `type2`. This is more restricted than broadcastable. bool IsTailOfShape(Type type1, Type type2) { diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc index cde253b6ebc..83ecf0be820 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc @@ -58,12 +58,14 @@ static void UpdateFuncType(FuncOp func) { // TODO(jpienaar): Remove when recursive side-effect modeling is added. 
static bool IsSideEffectFree(FuncOp func) { - return func.getBody() - .walk([&](Operation* op) { - if (!op->hasNoSideEffect()) return WalkResult::interrupt(); - return WalkResult::advance(); - }) - .wasInterrupted(); + return !func.getBody() + .walk([&](Operation* op) { + if (!MemoryEffectOpInterface::hasNoEffect(op) && + !op->isKnownTerminator()) + return WalkResult::interrupt(); + return WalkResult::advance(); + }) + .wasInterrupted(); } // Folds TensorFlow If op with constant conditional operand by inlining the diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 0ad5be055dc..144227b06af 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -285,6 +285,10 @@ foreach L2NormalizePairs = [[TFL_MulOp, TFL_RsqrtOp], [TFL_DivOp, TFL_SqrtOp]] def AreBroadcastableTypes : Constraint>; +def OperandsBroadcastToOutputType : Constraint>; + def IsTailOfShape : Constraint>; @@ -293,15 +297,15 @@ def HaveSameType : Constraint>; // Pattern for skipping Tile if it is mainly for broadcasting and the // Op is already supporting broadcasting. multiclass FuseTileBroadcastIntoFollowingBinary { - def : Pat<(BinaryOp (TFL_TileOp $input, (ConstantOp $tile)), + def : Pat<(BinaryOp:$result (TFL_TileOp $input, (ConstantOp $tile)), $operand, $act_func), (BinaryOp $input, $operand, $act_func), - [(AreBroadcastableTypes $input, $operand)]>; + [(OperandsBroadcastToOutputType $input, $operand, $result)]>; - def : Pat<(BinaryOp $operand, + def : Pat<(BinaryOp:$result $operand, (TFL_TileOp $input, (ConstantOp $tile)), $act_func), (BinaryOp $operand, $input, $act_func), - [(AreBroadcastableTypes $operand, $input)]>; + [(OperandsBroadcastToOutputType $operand, $input, $result)]>; } // Multi-pattern consisting of matching stand-alone op or op followed by relu. diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 6cccdf5aa8d..b2cc58b863a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -23,7 +23,6 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "mlir/Analysis/CallInterfaces.h" // TF:llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -35,6 +34,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/SymbolTable.h" // TF:llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Support/LLVM.h" // TF:llvm-project #include "mlir/Support/LogicalResult.h" // TF:llvm-project @@ -141,10 +141,7 @@ LogicalResult CheckOutputConsumer( for (int i = 0; i < expected_num_outputs; ++i) { auto it = expected_consumer_indices.find(i); - if (it != expected_consumer_indices.end()) { - // Expected consumer. - if (call_op->getResult(i).use_empty()) return failure(); - } else { + if (it == expected_consumer_indices.end()) { // Unexpected consumer. 
if (!call_op->getResult(i).use_empty()) return failure(); } @@ -160,8 +157,9 @@ LogicalResult CheckFusableKerasLstm(FuncOp lstm_func, ModuleOp module) { if (call_op && op->getAttrOfType("f").getRootReference() == lstm_func.getName()) { // Keras LSTM have 5 outputs. - // We should make sure only the second output is consumed. - if (failed(CheckOutputConsumer(call_op, 5, {1}))) check_failed = true; + // We should make sure only the first or the second output are consumed. + if (failed(CheckOutputConsumer(call_op, 5, {0, 1}))) + check_failed = true; } }); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index ef6fd1899d2..7592f462f6b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -56,6 +56,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.h" #define DEBUG_TYPE "tf-tfl-legalization" @@ -195,7 +196,7 @@ using PreparePerChannelFakeQuant = template struct ConvertTFConvOp : public RewritePattern { // Transient state for preserving data from match to rewrite - struct ConvertTFConvOpMatchState : public PatternState { + struct ConvertTFConvOpMatchState { IntegerAttr dilation_height_factor; IntegerAttr dilation_width_factor; StringAttr padding; @@ -207,7 +208,8 @@ struct ConvertTFConvOp : public RewritePattern { : RewritePattern(TFConvOpType::getOperationName(), 1, context), intAttrOne(Builder(context).getI32IntegerAttr(1)) {} - PatternMatchResult match(Operation *op) const override { + PatternMatchResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { // Assumes TensorFlow convolution op is already verified to be // in valid form. @@ -226,38 +228,29 @@ struct ConvertTFConvOp : public RewritePattern { IntegerAttr height, width; if (!TFIntListIs1XY1(op, "strides", &height, &width)) return matchFailure(); - auto state = std::make_unique(); - - state->stride_height = height; - state->stride_width = width; + ConvertTFConvOpMatchState state; + state.stride_height = height; + state.stride_width = width; if (TFIntListIs1XY1(op, "dilations", &height, &width)) { - state->dilation_height_factor = height; - state->dilation_width_factor = width; + state.dilation_height_factor = height; + state.dilation_width_factor = width; } else { // If the 'dilations' attribute is missing, we use the default value (1) // for both dilation height and width factor. - state->dilation_height_factor = intAttrOne; - state->dilation_width_factor = intAttrOne; + state.dilation_height_factor = intAttrOne; + state.dilation_width_factor = intAttrOne; } - StringAttr padding_attr; - if (!TFPaddingIsSameOrValid(op, &padding_attr)) return matchFailure(); - state->padding = padding_attr; + if (!TFPaddingIsSameOrValid(op, &state.padding)) return matchFailure(); // Additionally, we require the filter operand to be of 4-D tensor type so // that we can extract info from the shape (e.g., for constructing bias // tensor, for setting depth_multiplier attribute, etc.). 
- auto filter_type = - tf_op.filter().getType().template dyn_cast(); - if (filter_type && filter_type.getRank() == 4) - return matchSuccess(std::move(state)); + auto filter = tf_op.filter(); + auto filter_type = filter.getType().template dyn_cast(); + if (!filter_type || filter_type.getRank() != 4) return matchFailure(); - return matchFailure(); - } - - void rewrite(Operation *op, std::unique_ptr state, - PatternRewriter &rewriter) const override { // TensorFlow convolution op only has two inputs, while the TFLite one has // three, with the bias vector marked as optional. However, TOCO has a // dedicated pass, EnsureBiasVectors, to create default bias vectors for all @@ -267,11 +260,7 @@ struct ConvertTFConvOp : public RewritePattern { // TODO(antiagainst): also handle the case of tf.Add(tf.[op], ) - TFConvOpType tf_op = cast(op); - // Get a splat zero tensor with the expected dimension for the bias tensor - auto filter = tf_op.filter(); - auto filter_type = filter.getType().template cast(); auto elem_type = filter_type.getElementType(); auto bias_dim = static_cast(this)->getBiasDim( filter_type.getShape()); @@ -280,12 +269,12 @@ struct ConvertTFConvOp : public RewritePattern { auto bias = rewriter.create(op->getLoc(), bias_type, bias_attr); - auto *conv_state = static_cast(state.get()); auto conv_op = static_cast(this)->createTFLOp( - conv_state, rewriter, op->getLoc(), tf_op.getType(), tf_op.input(), - filter, bias); + &state, rewriter, op->getLoc(), tf_op.getType(), tf_op.input(), filter, + bias); rewriter.replaceOp(op, conv_op.getResult()); + return matchSuccess(); } const IntegerAttr intAttrOne; @@ -655,8 +644,8 @@ void PrepareTFPass::runOnFunction() { patterns.insert, TF::ConvertTFBatchMatMulOp>(ctx); } - patterns.insert(ctx); + patterns.insert(ctx); applyPatternsGreedily(func, patterns); } diff --git a/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td b/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td index b0435b7cf4c..6943b9c03e4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/tensorlist_patterns.td @@ -26,3 +26,8 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" def ConvertTensorListFromTensor : Pat< (TF_TensorListFromTensorOp $tensor, $element_shape), (replaceWithValue $tensor)>; + +// This pattern is in PrepareTF pass and added here temporary +// TODO(karimnosseir): Move away from here after looking in ordering +// the passes. +def : Pat<(TF_StopGradientOp $arg), (TF_IdentityOp $arg)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc index 8ed5b0e0341..be024eccd45 100644 --- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc +++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc @@ -98,7 +98,6 @@ void WhileOutlinePass::OutlineWhile(WhileOp while_op) { extern_values.insert(extern_value); continue; } - assert(extern_value.getDefiningOp()->hasNoSideEffect()); if (!const_none) { // Add constant at start of region. 
auto const_builder = diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc index 85bd6a18764..7158d634a89 100644 --- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc @@ -38,7 +38,7 @@ mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder) { case tflite::TensorType_INT32: return builder.getIntegerType(32); case tflite::TensorType_UINT8: - return mlir::TF::Uint8Type::get(builder.getContext()); + return builder.getIntegerType(8, /*isSigned=*/false); case tflite::TensorType_INT64: return builder.getIntegerType(64); case tflite::TensorType_STRING: diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc index 398433ca996..a138812e54d 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -73,21 +73,29 @@ Value CreateI64DenseConst(OpBuilder* builder, ArrayRef shape, return builder->create(location, type, attr); } +Value CreateI32DenseConst(OpBuilder* builder, ArrayRef values, + mlir::Location location) { + auto type = RankedTensorType::get(static_cast(values.size()), + builder->getIntegerType(32)); + auto attr = DenseElementsAttr::get(type, values); + return builder->create(location, type, attr); +} + Value CreateNoneValue(OpBuilder* builder, mlir::Location location) { return builder->create(location, builder->getNoneType(), builder->getUnitAttr()); } Value Transpose(OpBuilder* builder, Value value_to_transpose, - SmallVector perm, RankedTensorType original_type, + SmallVector perm, RankedTensorType original_type, mlir::Location location) { // Create a constant op for transpose permutation. - auto perm_op = CreateI64DenseConst(builder, perm, perm, location); + auto perm_op = CreateI32DenseConst(builder, perm, location); // Create tensor type for the transpose result. auto transpose_type = original_type; auto transpose_shape = functional::map( - [transpose_type](int64_t dim) { return transpose_type.getDimSize(dim); }, + [transpose_type](int32_t dim) { return transpose_type.getDimSize(dim); }, perm); auto elem_type = transpose_type.getElementType(); auto result_type = RankedTensorType::get(transpose_shape, elem_type); @@ -99,7 +107,7 @@ Value Transpose(OpBuilder* builder, Value value_to_transpose, Value Transpose2D(OpBuilder* builder, Value value_to_transpose, RankedTensorType type, mlir::Location location) { // Create a constant op for transpose permutation. 
- SmallVector perm = {1, 0}; + SmallVector perm = {1, 0}; return Transpose(builder, value_to_transpose, perm, type, location); } @@ -148,6 +156,27 @@ Value SliceRankedTensor(OpBuilder* builder, Value input, input, slice_i2c_begin, slice_i2c_size); } +Value CreateStridedSliceOp(mlir::Location loc, ArrayRef output_shape, + Value input, ArrayRef begin, + ArrayRef end, ArrayRef strides, + int64_t begin_mask, int64_t end_mask, + int64_t ellipsis_mask, int64_t new_axis_mask, + int64_t shrink_axis_mask, OpBuilder* builder) { + auto output_type = RankedTensorType::get( + output_shape, input.getType().cast().getElementType()); + auto begin_tensor = CreateI32DenseConst(builder, begin, loc); + auto end_tensor = CreateI32DenseConst(builder, end, loc); + auto strides_tensor = CreateI32DenseConst(builder, strides, loc); + + return builder->create( + loc, output_type, input, begin_tensor, end_tensor, strides_tensor, + builder->getI64IntegerAttr(begin_mask), + builder->getI64IntegerAttr(end_mask), + builder->getI64IntegerAttr(ellipsis_mask), + builder->getI64IntegerAttr(new_axis_mask), + builder->getI64IntegerAttr(shrink_axis_mask)); +} + } // namespace void ConvertLSTMCellSimpleToFusedLSTM::SetWeightForInputToCellGate() { @@ -386,7 +415,12 @@ LogicalResult ConvertLSTMCellSimpleToFusedLSTM::RewriteFunc() { forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, output_layer_norm_coefficients_, builder_.getStringAttr("TANH"), builder_.getF32FloatAttr(10.0), builder_.getF32FloatAttr(0.0), - builder_.getStringAttr("FULL")); + builder_.getStringAttr("FULL"), + /*input_to_input_intermediate=*/mlir::TypeAttr(), + /*input_to_forget_intermediate=*/mlir::TypeAttr(), + /*input_to_cell_intermediate=*/mlir::TypeAttr(), + /*input_to_output_intermediate=*/mlir::TypeAttr(), + /*effective_hidden_scale_intermediate=*/mlir::TypeAttr()); // Cast the static shaped lstm result to FuncOp's signature - // Ranked but unknown 2nd dimension to support stacking these. @@ -588,16 +622,6 @@ LogicalResult CreateEqualSizeSplitVOp(Value input, int axis, int splits, return success(); } -void UpdateFuncSignature(int batch, int time, int output, - mlir::FuncOp* func_op) { - SmallVector output_shape{batch, time, output}; - auto input_types = func_op->getType().getInputs(); - auto element_type = input_types[0].cast().getElementType(); - auto output_type = mlir::RankedTensorType::get(output_shape, element_type); - func_op->setType( - mlir::FunctionType::get(input_types, output_type, func_op->getContext())); -} - // TODO(b/147436982): Consider refactor this to be more general. LogicalResult ConvertKerasLSTMLayer(mlir::FuncOp func_op, OpBuilder* builder) { // For argument order, please check out standard_lstm under @@ -626,26 +650,21 @@ LogicalResult ConvertKerasLSTMLayer(mlir::FuncOp func_op, OpBuilder* builder) { auto final_inputs = input; auto final_input_type = input_type; - // We will transpose the inputs. - if (!time_majored) { - SmallVector perm = {1, 0, 2}; - final_inputs = - Transpose(builder, final_inputs, perm, input_type, func_op.getLoc()); - final_input_type = final_inputs.getType().dyn_cast(); - } // Handle go_backwards: // LSTM in Keras semantic will reverse the input sequence if it's go_backwards auto go_backwards_attr = func_op.getAttrOfType("tf.go_backwards"); if (go_backwards_attr != nullptr && go_backwards_attr.getValue()) { - // We assume input is already in {time, batch, size} layout. - final_inputs = - Reverse(builder, final_inputs, 0, final_input_type, func_op.getLoc()); + int time_dim = time_majored ? 
0 : 1; + final_inputs = Reverse(builder, final_inputs, time_dim, final_input_type, + func_op.getLoc()); } - int batch = final_input_type.getDimSize(1); - int time = final_input_type.getDimSize(0); + int batch = time_majored ? final_input_type.getDimSize(1) + : final_input_type.getDimSize(0); + int time = time_majored ? final_input_type.getDimSize(0) + : final_input_type.getDimSize(1); // Setup correct weights. RankedTensorType weight_type = @@ -686,14 +705,20 @@ LogicalResult ConvertKerasLSTMLayer(mlir::FuncOp func_op, OpBuilder* builder) { return failure(); // Build the lstm op. - SmallVector output_shape = {time, batch, n_output}; + SmallVector output_shape; + if (time_majored) { + output_shape = {time, batch, n_output}; + } else { + output_shape = {batch, time, n_output}; + } auto result_type = mlir::RankedTensorType::get( - output_shape, input.getType().cast().getElementType()); + output_shape, + final_inputs.getType().cast().getElementType()); Value none = builder->create( func_op.getLoc(), builder->getNoneType(), builder->getUnitAttr()); auto lstm = builder->create( - func_op.getLoc(), result_type, /*input=*/input, + func_op.getLoc(), result_type, /*input=*/final_inputs, /*input_to_input_weights=*/weights_array->getResult(0), /*input_to_forget_weights=*/weights_array->getResult(1), /*input_to_cell_weights=*/weights_array->getResult(2), @@ -718,29 +743,80 @@ LogicalResult ConvertKerasLSTMLayer(mlir::FuncOp func_op, OpBuilder* builder) { /*cell_layer_norm_coefficients=*/none, /*output_layer_norm_coefficients=*/none, builder->getStringAttr("TANH"), builder->getF32FloatAttr(10.0), builder->getF32FloatAttr(0.0), - builder->getBoolAttr(true)); + builder->getBoolAttr(time_majored)); - auto final_output = lstm.getResult(); - if (!time_majored) { - SmallVector perm = {1, 0, 2}; - final_output = - Transpose(builder, final_output, perm, result_type, func_op.getLoc()); + auto final_output_full_sequences = lstm.getResult(); + + // Populate the last output: last output is sliced from the full sequences. + // If time_major: last_output = outputs[-1, :, :] + // else: last_output = outputs[:, -1, :] + // + // As we are creating the strided_slice op, we need to populate the following + // fields: + // end: should always be (0, 0, 0) + // strides: should always be (1, 1, 1) + // begin: should be (0, -1, 0) or (-1, 0, 0) if it's time-majored. + // new_axis_mask: should always be 0. + // ellipsis_mask: should always be 0. + // begin_mask & end_mask: should be 0b101 = 5 or 0b110 = 4 if it's + // time-majored. shrink_axis_mask: should be 0b010 = 2 or 0b001 = 1 if it's + // time-majored. + SmallVector last_output_shape({batch, n_output}); + + SmallVector end({0, 0, 0}); + SmallVector strides({1, 1, 1}); + SmallVector begin; + + int64_t new_axis_mask = 0; + int64_t ellipsis_mask = 0; + int64_t begin_mask; + int64_t end_mask; + int64_t shrink_axis_mask; + if (time_majored) { + begin_mask = 6; + end_mask = 6; + shrink_axis_mask = 1; + begin = {-1, 0, 0}; + } else { + begin_mask = 5; + end_mask = 5; + shrink_axis_mask = 2; + begin = {0, -1, 0}; } + auto last_output = CreateStridedSliceOp( + func_op.getLoc(), last_output_shape, final_output_full_sequences, begin, + end, strides, begin_mask, end_mask, ellipsis_mask, new_axis_mask, + shrink_axis_mask, builder); + SmallVector outputs; + SmallVector output_types; - for (int i = 0; i < 5; ++i) { - if (i == 1) { - // only this one is the real output. 
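// Aside (worked example, not part of the patch): the non-time-major call to
// CreateStridedSliceOp above, spelled out with literal mask values.
// begin_mask = end_mask = 0b101 (= 5) leaves the batch and feature dims
// unclipped, shrink_axis_mask = 0b010 (= 2) drops the size-1 time dim, so the
// result is outputs[:, -1, :]. The time-major case uses 0b110 (= 6) for
// begin_mask/end_mask and 0b001 (= 1) for shrink_axis_mask, matching the code.
auto example_last_output = CreateStridedSliceOp(
    func_op.getLoc(), /*output_shape=*/{batch, n_output},
    /*input=*/final_output_full_sequences,
    /*begin=*/{0, -1, 0}, /*end=*/{0, 0, 0}, /*strides=*/{1, 1, 1},
    /*begin_mask=*/5, /*end_mask=*/5, /*ellipsis_mask=*/0,
    /*new_axis_mask=*/0, /*shrink_axis_mask=*/2, builder);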
- outputs.push_back(final_output); - } else { - auto result_type = - func_op.getCallableResults()[i].dyn_cast(); - outputs.push_back(CreatTfF32ConstOp(builder, result_type.getShape(), 0.0f, - func_op.getLoc())); - } + // Due to the existence of the while loop, the timestamp may be unknown + // for the signature, for us, since we know the inputs, we can infer the time + // steps. + + // Last output. + outputs.push_back(last_output); + output_types.push_back(last_output.getType()); + + // Full sequences. + outputs.push_back(final_output_full_sequences); + output_types.push_back(final_output_full_sequences.getType()); + + // All the rest: states, device. + for (int i = 2; i < 5; ++i) { + auto result_type = + func_op.getCallableResults()[i].dyn_cast(); + outputs.push_back(CreatTfF32ConstOp(builder, result_type.getShape(), 0.0f, + func_op.getLoc())); + output_types.push_back(result_type); } + // Update function signatures. + func_op.setType(mlir::FunctionType::get(func_op.getType().getInputs(), + output_types, func_op.getContext())); + builder->create(func_op.getLoc(), outputs); return success(); } diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc new file mode 100644 index 00000000000..e554686531a --- /dev/null +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -0,0 +1,215 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" + +#include + +#include "absl/container/flat_hash_set.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_os_ostream.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/public/session_options.h" + +namespace tensorflow { + +static inline absl::string_view StringRefToView(llvm::StringRef ref) { + return {ref.data(), ref.size()}; +} + +// Dumps the MLIR module to disk. +// This require the TF_DUMP_GRAPH_PREFIX to be set to a path that exist (or can +// be created). 
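// Usage note (not part of the patch): DumpModule below writes files only when
// TF_DUMP_GRAPH_PREFIX points at a directory that exists or can be created,
// and the callers gate dumping on VLOG(1), e.g.
//   TF_DUMP_GRAPH_PREFIX=/tmp/mlir_dumps TF_CPP_MIN_VLOG_LEVEL=1 <your program>
// (the VLOG knob shown is an assumption; any way of enabling VLOG(1) works).
// Each registered pass then dumps mlir_<pass-name>_before_*.mlir and
// mlir_<pass-name>_after_*.mlir snapshots of the module.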
+static void DumpModule(mlir::ModuleOp module, std::string file_prefix) { + std::string prefix = GetDumpDirFromEnvVar(); + if (prefix.empty()) return; + + auto* env = tensorflow::Env::Default(); + auto status = env->RecursivelyCreateDir(prefix); + if (!status.ok()) { + LOG(WARNING) << "cannot create directory '" + prefix + + "': " + status.error_message(); + return; + } + + prefix += "/" + file_prefix; + if (!tensorflow::Env::Default()->CreateUniqueFileName(&prefix, ".mlir")) { + LOG(WARNING) << "cannot create unique filename, won't dump MLIR module."; + return; + } + + std::unique_ptr file_writer; + status = env->NewWritableFile(prefix, &file_writer); + if (!status.ok()) { + LOG(WARNING) << "cannot open file '" + prefix + + "': " + status.error_message(); + return; + } + + // Print the module to a string before writing to the file. + std::string txt_module; + { + llvm::raw_string_ostream os(txt_module); + module.print(os); + } + + status = file_writer->Append(txt_module); + if (!status.ok()) { + LOG(WARNING) << "error writing to file '" + prefix + + "': " + status.error_message(); + return; + } + (void)file_writer->Close(); + VLOG(1) << "Dumped MLIR module to " << prefix; +} + +MlirOptimizationPassRegistry& MlirOptimizationPassRegistry::Global() { + static auto* global = new MlirOptimizationPassRegistry(); + return *global; +} + +Status MlirFunctionOptimizationPass::Run( + const DeviceSet& device_set, const ConfigProto& config_proto, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) { + // Skip conversion from Graph to MLIR if none of the passes are enabled. + const bool is_enabled = + llvm::any_of(registry_->passes(), [&](auto& pass_registration) -> bool { + return pass_registration.pass->IsEnabled(config_proto); + }); + + if (!is_enabled) { + VLOG(1) << "None of the MLIR optimization passes are enabled " + << "(registered " << registry_->passes().size() << ")"; + return Status::OK(); + } + + VLOG(1) << "Running MLIR Graph Optimization Passes " + << "(registered " << registry_->passes().size() << " passes)"; + + GraphDebugInfo debug_info; + mlir::MLIRContext context; + GraphImportConfig import_config; + import_config.graph_as_function = true; + import_config.control_outputs = *control_ret_node_names; + TF_ASSIGN_OR_RETURN(auto module_ref, + ConvertGraphToMlir(**graph, debug_info, *flib_def, + import_config, &context)); + + AddDevicesToOp(*module_ref, &device_set); + + for (auto& pass_registration : registry_->passes()) { + llvm::StringRef name = pass_registration.pass->name(); + VLOG(2) << "Run MLIR graph optimization pass: " << StringRefToView(name); + + if (VLOG_IS_ON(1)) { + DumpModule(*module_ref, llvm::formatv("mlir_{0}_before_", name)); + } + + TF_RETURN_IF_ERROR(pass_registration.pass->Run(config_proto, *module_ref)); + + if (VLOG_IS_ON(1)) { + DumpModule(*module_ref, llvm::formatv("mlir_{0}_after_", name)); + } + } + + GraphExportConfig export_config; + export_config.graph_as_function = true; + absl::flat_hash_set control_ret_nodes; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + ConvertMlirToGraph(*module_ref, export_config, graph, flib_def, + &control_ret_nodes), + "Error converting MLIR module back to graph"); + + control_ret_node_names->clear(); + control_ret_node_names->reserve(control_ret_nodes.size()); + for (const auto* node : control_ret_nodes) + control_ret_node_names->push_back(node->name()); + + *control_rets_updated = true; + + return Status::OK(); +} + +MlirV1CompatOptimizationPassRegistry& 
+MlirV1CompatOptimizationPassRegistry::Global() { + static auto* global = new MlirV1CompatOptimizationPassRegistry(); + return *global; +} + +Status MlirV1CompatGraphOptimizationPass::Run( + const GraphOptimizationPassOptions& options) { + // Skip function graphs as MlirOptimizationPassRegistry_ will be used instead. + if (options.is_function_graph) return Status::OK(); + + // Skip conversion from Graph to MLIR if none of the passes are enabled. + const bool is_enabled = + absl::c_any_of(registry_->passes(), [&](auto& pass_registration) -> bool { + return pass_registration.pass->IsEnabled( + options.session_options->config); + }); + + if (!is_enabled) { + VLOG(1) << "None of the MLIR optimization passes are enabled " + << "(registered" << registry_->passes().size() << " passes)"; + return Status::OK(); + } + + VLOG(1) << "Running MLIR Graph Optimization V1 Compat Passes " + << "(registered" << registry_->passes().size() << " passes)"; + + GraphDebugInfo debug_info; + mlir::MLIRContext context; + GraphImportConfig import_config; + import_config.upgrade_legacy = true; + TF_ASSIGN_OR_RETURN( + auto module_ref, + ConvertGraphToMlir(**options.graph, debug_info, *options.flib_def, + import_config, &context)); + + AddDevicesToOp(*module_ref, options.device_set); + + for (auto& pass_registration : registry_->passes()) { + llvm::StringRef name = pass_registration.pass->name(); + VLOG(2) << "Run MLIR graph optimization pass: " << StringRefToView(name); + + if (VLOG_IS_ON(1)) { + DumpModule(*module_ref, llvm::formatv("mlir_{0}_before_", name)); + } + + TF_RETURN_IF_ERROR(pass_registration.pass->Run(options, *module_ref)); + + if (VLOG_IS_ON(1)) { + DumpModule(*module_ref, llvm::formatv("mlir_{0}_after_", name)); + } + } + + GraphExportConfig export_config; + TF_RETURN_WITH_CONTEXT_IF_ERROR( + ConvertMlirToGraph(*module_ref, export_config, options.graph, + options.flib_def), + "Error converting MLIR module back to graph"); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h new file mode 100644 index 00000000000..aed5307d39d --- /dev/null +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h @@ -0,0 +1,179 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ + +#include "mlir/IR/Module.h" // TF:llvm-project +#include "tensorflow/core/common_runtime/function_optimization_registry.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +// -------------------------------------------------------------------------- // +// MLIR passes running on Tensorflow function graphs (Tensorflow V2). 
+// -------------------------------------------------------------------------- // + +// An API for registering MLIR ModulePass with the Tensorflow runtime. These +// passes are running only for function graphs built by Tensorflow V2 and +// instantiated by the process_function_library_runtime (see +// FunctionOptimizationPass for details). +class MlirOptimizationPass { + public: + virtual ~MlirOptimizationPass() = default; + virtual llvm::StringRef name() const = 0; + virtual bool IsEnabled(const ConfigProto& config_proto) const = 0; + + virtual Status Run(const ConfigProto& config_proto, + mlir::ModuleOp module) = 0; +}; + +class MlirOptimizationPassRegistry { + public: + struct PassRegistration { + int priority; + std::unique_ptr pass; + }; + + struct PriorityComparator { + bool operator()(const PassRegistration& x, + const PassRegistration& y) const { + return x.priority < y.priority; + } + }; + + using Passes = std::set; + + // Returns the global registry of MLIR optimization passes. + static MlirOptimizationPassRegistry& Global(); + + void Add(int priority, std::unique_ptr pass) { + passes_.insert({priority, std::move(pass)}); + } + + const Passes& passes() const { return passes_; } + + private: + Passes passes_; +}; + +// Function optimization pass that runs all MLIR passes registered in +// MlirOptimizationPassRegistry. +class MlirFunctionOptimizationPass : public FunctionOptimizationPass { + public: + explicit MlirFunctionOptimizationPass( + const MlirOptimizationPassRegistry* registry = + &MlirOptimizationPassRegistry::Global()) + : registry_(registry) {} + + Status Run(const DeviceSet& device_set, const ConfigProto& config_proto, + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override; + + private: + const MlirOptimizationPassRegistry* registry_; +}; + +// -------------------------------------------------------------------------- // +// MLIR passes running on Tensorflow V1 graphs. +// -------------------------------------------------------------------------- // + +// An API for registering MLIR ModulePass with the Tensorflow runtime. These +// passes are running only for V1 graphs (legacy graphs) executed via Session +// runtime. Graph importer updates legacy graph behavior to V2 constructs (e.g. +// it raises control flow from Switch/Merge nodes to functional control flow +// with If/While operations). +class MlirV1CompatOptimizationPass { + public: + virtual ~MlirV1CompatOptimizationPass() = default; + virtual llvm::StringRef name() const = 0; + virtual bool IsEnabled(const ConfigProto& config_proto) const = 0; + + virtual Status Run(const GraphOptimizationPassOptions& options, + mlir::ModuleOp module) = 0; +}; + +class MlirV1CompatOptimizationPassRegistry { + public: + struct PassRegistration { + int priority; + std::unique_ptr pass; + }; + + struct PriorityComparator { + bool operator()(const PassRegistration& x, + const PassRegistration& y) const { + return x.priority < y.priority; + } + }; + + using Passes = std::set; + + // Returns the global registry of MLIR optimization passes. 
+ static MlirV1CompatOptimizationPassRegistry& Global(); + + void Add(int priority, std::unique_ptr pass) { + passes_.insert({priority, std::move(pass)}); + } + + const Passes& passes() const { return passes_; } + + private: + Passes passes_; +}; + +class MlirV1CompatGraphOptimizationPass : public GraphOptimizationPass { + public: + explicit MlirV1CompatGraphOptimizationPass( + const MlirV1CompatOptimizationPassRegistry* registry = + &MlirV1CompatOptimizationPassRegistry::Global()) + : registry_(registry) {} + + Status Run(const GraphOptimizationPassOptions& options) override; + + private: + const MlirV1CompatOptimizationPassRegistry* registry_; +}; + +// -------------------------------------------------------------------------- // +// Helper classes for static registration of MLIR (V1 Compat) passes in the +// corresponding registry. +// -------------------------------------------------------------------------- // + +namespace mlir_pass_registration { + +class MlirOptimizationPassRegistration { + public: + explicit MlirOptimizationPassRegistration( + int priority, std::unique_ptr pass) { + MlirOptimizationPassRegistry::Global().Add(priority, std::move(pass)); + } +}; + +class MlirV1CompatOptimizationPassRegistration { + public: + explicit MlirV1CompatOptimizationPassRegistration( + int priority, std::unique_ptr pass) { + MlirV1CompatOptimizationPassRegistry::Global().Add(priority, + std::move(pass)); + } +}; + +} // namespace mlir_pass_registration + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_registration.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_registration.cc new file mode 100644 index 00000000000..8155af6505e --- /dev/null +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_registration.cc @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +namespace tensorflow { + +static function_optimization_registration::FunctionOptimizationPassRegistration + register_mlir_passes(std::make_unique()); + +REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0, + MlirV1CompatGraphOptimizationPass); + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index 3a308e2e9d2..67533197f3e 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -59,6 +59,9 @@ if platform.system() == 'Windows': else: llvm_config.use_default_substitutions() +llvm_config.config.substitutions.append( + ('%tfrt_bindir', 'tensorflow/compiler/aot')) + # Tweak the PATH to include the tools dir. 
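// Aside (sketch, not part of the patch): how a pass can plug into the V2
// registry declared in mlir_graph_optimization_pass.h. The pass name, priority
// and enabling logic here are made up for illustration.
#include <memory>

#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h"

namespace tensorflow {
namespace {

class ExampleMlirPass : public MlirOptimizationPass {
 public:
  llvm::StringRef name() const override { return "example-mlir-pass"; }

  bool IsEnabled(const ConfigProto& config_proto) const override {
    return true;  // A real pass would typically check an experimental flag.
  }

  Status Run(const ConfigProto& config_proto, mlir::ModuleOp module) override {
    // Inspect or rewrite `module` in place; returning OK keeps the pipeline going.
    return Status::OK();
  }
};

// Mirrors mlir_graph_optimization_pass_registration.cc; passes with lower
// priority values run first because the registry orders by priority.
static mlir_pass_registration::MlirOptimizationPassRegistration
    register_example_pass(/*priority=*/10, std::make_unique<ExampleMlirPass>());

}  // namespace
}  // namespace tensorflow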
llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) @@ -68,7 +71,7 @@ tool_dirs = config.mlir_tf_tools_dirs + [ tool_names = [ 'mlir-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', - 'mlir-tflite-runner' + 'mlir-tflite-runner', 'tfcompile' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index b324386662e..6c369a5a24c 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -45,6 +45,7 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/mlir/lite', 'tensorflow/compiler/mlir/tensorflow', 'tensorflow/compiler/mlir/xla', + 'tensorflow/compiler/aot' ] config.mlir_tf_tools_dirs = [ os.path.join(real_test_srcdir, os.environ['TEST_WORKSPACE'], s) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index d52fd0c3b72..e2aae0ec52e 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -10,6 +10,7 @@ package_group( name = "friends", includes = ["//third_party/mlir:subpackages"], packages = [ + "//learning/pathways/data_parallel/tf2xla/...", "//tensorflow/compiler/...", "//tensorflow/lite/experimental/tf_runtime/...", "//tensorflow/python/...", @@ -24,7 +25,8 @@ filegroup( "ir/tf_op_interfaces.td", "ir/tf_ops.td", "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Analysis/CallInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", ], ) @@ -62,6 +64,14 @@ gentbl( "-gen-op-doc", "g3doc/tf_ops.md", ), + ( + "-gen-struct-attr-decls", + "ir/tf_structs.h.inc", + ), + ( + "-gen-struct-attr-defs", + "ir/tf_structs.cc.inc", + ), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/tf_ops.td", @@ -186,6 +196,7 @@ cc_library( "ir/tf_ops.cc.inc", "ir/tf_ops.h.inc", "ir/tf_saved_model.cc", + "ir/tf_structs.cc", "ir/tf_verifiers.cc", ], hdrs = [ @@ -194,12 +205,14 @@ cc_library( "ir/tf_executor.h", "ir/tf_ops.h", "ir/tf_saved_model.h", + "ir/tf_structs.h", "ir/tf_traits.h", "ir/tf_verifiers.h", "transforms/bridge.h", + "transforms/einsum.h", "transforms/passes.h", "transforms/unroll_batch_matmul.h", - "@llvm-project//mlir:include/mlir/Analysis/CallInterfaces.h", + "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.h", "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h", ], includes = ["include"], @@ -219,10 +232,12 @@ cc_library( "@llvm-project//llvm:support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:CallOpInterfacesIncGen", + "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", @@ -264,23 +279,49 @@ cc_library( ], ) +cc_library( + name = "unroll_batch_matmul_pass", + srcs = [ + "transforms/unroll_batch_matmul.cc", + ], + hdrs = [ + "transforms/unroll_batch_matmul.h", + ], + deps = [ + ":tensorflow", + "//tensorflow/core:framework", + "@com_google_absl//absl/memory", + "@llvm-project//llvm:support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + 
"@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "tensorflow_passes", srcs = [ "transforms/annotate_parameter_replication.cc", + "transforms/batchmatmul_to_einsum.cc", "transforms/bridge.cc", "transforms/bridge_pass.cc", "transforms/cluster_formation.cc", "transforms/cluster_outlining.cc", + "transforms/collection_ops_util.cc", "transforms/decompose_resource_ops_pass.cc", + "transforms/einsum.cc", "transforms/executor_island_coarsening.cc", "transforms/executor_tpuv1_inline_tpu_island.cc", "transforms/executor_tpuv1_island_coarsening.cc", "transforms/executor_tpuv1_outline_tpu_island.cc", "transforms/fold_switch.cc", + "transforms/freeze_global_tensors.cc", "transforms/functional_control_flow_to_cfg.cc", "transforms/generated_canonicalize.inc", "transforms/generated_optimize.inc", + "transforms/gpu_fusion.cc", "transforms/graph_pruning.cc", "transforms/launch_to_device_attribute.cc", "transforms/layout_optimization.cc", @@ -299,6 +340,7 @@ cc_library( "transforms/shape_inference_pass.cc", "transforms/sink_constant.cc", "transforms/stack_ops_decomposition.cc", + "transforms/tensor_list_ops_decomposition.cc", "transforms/test_side_effect_analysis.cc", "transforms/tf_device_assignment.cc", "transforms/tpu_cluster_formation.cc", @@ -308,17 +350,18 @@ cc_library( "transforms/tpu_rewrite_pass.cc", "transforms/tpu_sharding_identification_pass.cc", "transforms/tpu_variable_runtime_reformatting.cc", - "transforms/unroll_batch_matmul.cc", "translate/breakup-islands.cc", "translate/control_to_executor_dialect.cc", "translate/executor_to_control_dialect.cc", "translate/tf_functional_to_executor.cc", ], hdrs = [ + "transforms/batchmatmul_to_einsum.h", "transforms/bridge.h", + "transforms/collection_ops_util.h", + "transforms/einsum.h", "transforms/passes.h", "transforms/shape_inference.h", - "transforms/unroll_batch_matmul.h", ], includes = ["include"], deps = [ @@ -337,6 +380,8 @@ cc_library( ":tensorflow_types", ":tpu_rewrite_device_util", ":translate_utils", + ":unroll_batch_matmul_pass", + ":xla_sharding_util", "//tensorflow/compiler/mlir/lite:validators", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla:xla_proto_cc", @@ -355,7 +400,6 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", - "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", @@ -378,13 +422,34 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "graph_optimization_pass", + srcs = ["transforms/graph_optimization_pass.cc"], + hdrs = ["transforms/graph_optimization_pass.h"], + deps = [ + ":tensorflow_passes", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", + ], + alwayslink = 1, +) + +cc_library( + name = "graph_optimization_pass_registration", + srcs = ["transforms/graph_optimization_pass_registration.cc"], + deps = [ + ":graph_optimization_pass", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass_registration", + ], + alwayslink = 1, +) + # Library with TensorFlow dialect static initialization. 
cc_library( name = "tensorflow_dialect_registration", srcs = ["ir/dialect_registration.cc"], deps = [ ":tensorflow", - "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:LoopOpsTransforms", ], @@ -703,18 +768,34 @@ cc_library( ) cc_library( - name = "tf_dialect_passes", + name = "decode_constant_pass", srcs = [ - "transforms/constant_fold.cc", "transforms/decode_constant.cc", - "transforms/dialect_hooks.cc", ], hdrs = [ - "transforms/constant_fold.h", "transforms/decode_constant.h", ], deps = [ ":convert_tensor", + ":tensorflow", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + ], + alwayslink = 1, +) + +cc_library( + name = "tf_dialect_passes", + srcs = [ + "transforms/constant_fold.cc", + "transforms/dialect_hooks.cc", + ], + hdrs = [ + "transforms/constant_fold.h", + ], + deps = [ + ":convert_tensor", + ":decode_constant_pass", ":eval_util", ":tensorflow", ":tensorflow_types", @@ -725,9 +806,8 @@ cc_library( "//tensorflow/stream_executor", "//tensorflow/stream_executor/lib", "@llvm-project//llvm:support", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:Support", ], alwayslink = 1, @@ -738,6 +818,7 @@ cc_library( deps = [ ":tensorflow_dialect_registration", ":tf_dialect_passes", + "@llvm-project//mlir:AllPassesAndDialects", ], ) @@ -880,7 +961,8 @@ tf_native_cc_binary( genrule( name = "derived_attr_populator_inc", srcs = [ - "@llvm-project//mlir:include/mlir/Analysis/CallInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", "ir/tf_generated_ops.td", "ir/tf_op_base.td", @@ -931,7 +1013,6 @@ cc_library( ":error_util", ":tensorflow_dialect_registration", ":tensorflow_passes", - ":tf_dialect_passes", ":translate_utils", "//tensorflow/compiler/mlir/xla:hlo", "//tensorflow/compiler/mlir/xla:mlir_hlo_to_hlo", @@ -943,6 +1024,7 @@ cc_library( "//tensorflow/core/platform:logging", "//tensorflow/stream_executor/lib", "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", @@ -1043,7 +1125,7 @@ cc_library( srcs = ["utils/tpu_rewrite_device_util.cc"], hdrs = ["utils/tpu_rewrite_device_util.h"], deps = [ - "//tensorflow/compiler/xla:array3d", + "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/core:framework", @@ -1074,6 +1156,7 @@ cc_library( srcs = ["utils/device_util.cc"], hdrs = ["utils/device_util.h"], deps = [ + ":tensorflow", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", "@llvm-project//llvm:support", @@ -1154,3 +1237,20 @@ cc_library( "@llvm-project//mlir:Support", ], ) + +cc_library( + name = "xla_sharding_util", + srcs = [ + "utils/xla_sharding_util.cc", + ], + hdrs = [ + "utils/xla_sharding_util.h", + ], + deps = [ + ":tensorflow", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index 84c3cd64a5f..931f24b9606 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ 
b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -321,7 +321,13 @@ bool OpIsDeclaration(Operation* op, // Returns if `op` is know to not have any side effect. bool OpIsKnownToHaveNoSideEffect(Operation* op) { - if (op->hasNoSideEffect()) return true; + // TODO(riverriddle) We shouldn't treat all terminator operations as having + // side effects, this should be relaxed. + // TODO(riverriddle) Properly handle region side effects. + if (MemoryEffectOpInterface::hasNoEffect(op) && op->isKnownNonTerminator() && + op->getNumRegions() == 0) { + return true; + } if (auto if_op = llvm::dyn_cast(op)) { return if_op.is_stateless(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 59a1cc21b28..0156d7e7e9d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // TF:llvm-project #include "mlir/IR/OpDefinition.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project namespace mlir { namespace TFControlFlow { @@ -84,7 +85,7 @@ class TFControlType : public Type::TypeBase { // Note: Additional result corresponds to the control output. class EnterOp : public Op::Impl, - OpTrait::NResults<2>::Impl, OpTrait::HasNoSideEffect> { + OpTrait::NResults<2>::Impl, MemoryEffectOpInterface::Trait> { public: using Op::Op; @@ -94,6 +95,9 @@ class EnterOp void setData(Value value) { setOperand(0, value); } LogicalResult verify(); + + // EnterOp has no side-effects. + void getEffects(SmallVectorImpl &) {} }; // The "_tf.Merge" operation takes a list of input operands and returns a value @@ -197,7 +201,7 @@ class NextIterationSinkOp // Note: Additional result corresponds to the control output. class LoopCondOp : public Op::Impl, - OpTrait::NResults<2>::Impl, OpTrait::HasNoSideEffect> { + OpTrait::NResults<2>::Impl, MemoryEffectOpInterface::Trait> { public: using Op::Op; static StringRef getOperationName() { return "_tf.LoopCond"; } @@ -206,6 +210,9 @@ class LoopCondOp void setData(Value value) { setOperand(0, value); } LogicalResult verify(); + + // LoopCondOp has no side-effects. + void getEffects(SmallVectorImpl &) {} }; // The "_tf.Switch" operation takes a data operand and a boolean predicate @@ -260,8 +267,9 @@ class SwitchOp : public Op::Impl, // (tensor<*xi32>, !_tf.control) // // Note: Additional result corresponds to the control output. -class ExitOp : public Op::Impl, - OpTrait::NResults<2>::Impl, OpTrait::HasNoSideEffect> { +class ExitOp + : public Op::Impl, + OpTrait::NResults<2>::Impl, MemoryEffectOpInterface::Trait> { public: using Op::Op; static StringRef getOperationName() { return "_tf.Exit"; } @@ -270,6 +278,9 @@ class ExitOp : public Op::Impl, void setData(Value value) { setOperand(0, value); } LogicalResult verify(); + + // ExitOp has no side-effects. + void getEffects(SmallVectorImpl &) {} }; } // namespace TFControlFlow diff --git a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc index ccab3d9c6e7..ac468d9810c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "mlir/InitAllDialects.h" // TF:llvm-project -#include "mlir/InitAllPasses.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -23,13 +21,6 @@ limitations under the License. namespace mlir { -static bool auto_init = []() { - registerAllDialects(); - registerAllPasses(); - - return true; -}(); - // Static initialization for TF dialect registration. static DialectRegistration tf_control_flow_ops; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index 5071910031f..38fb3154c48 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -85,6 +85,20 @@ struct TFInlinerInterface : public DialectInlinerInterface { /*truncate=*/builder.getBoolAttr(false)); } }; + +// Checks if a block wraps a single operation and the single operation results +// are perfectly forwarded to the block's terminator. +bool BlockWrapsSingleOp(Block* block) { + auto body = block->without_terminator(); + if (!has_single_element(body)) return false; + + Operation& wrapped_op = *body.begin(); + Operation* terminator = block->getTerminator(); + return wrapped_op.getNumResults() == terminator->getNumOperands() && + std::equal(wrapped_op.getResults().begin(), + wrapped_op.getResults().end(), + terminator->getOperands().begin()); +} } // end anonymous namespace TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) @@ -105,17 +119,7 @@ TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) // Checks if a tf_device.launch wraps a single operation and the single // operation results are perfectly forwarded to the launch return. 
-bool LaunchOp::WrapsSingleOp() { - auto body = GetBody().without_terminator(); - if (!has_single_element(body)) return false; - - Operation& wrapped_op = *body.begin(); - Operation* terminator = GetBody().getTerminator(); - return wrapped_op.getNumResults() == terminator->getNumOperands() && - std::equal(wrapped_op.getResults().begin(), - wrapped_op.getResults().end(), - terminator->getOperands().begin()); -} +bool LaunchOp::WrapsSingleOp() { return BlockWrapsSingleOp(&GetBody()); } //===----------------------------------------------------------------------===// // tf_device.return @@ -210,30 +214,32 @@ void ParallelExecuteOp::build(Builder* builder, OperationState& state, state.addTypes(output_types); } -std::vector ParallelExecuteOp::GetRegionOutputs( - unsigned region_index) { - int num_region_results = - GetRegionBlockWithIndex(region_index).getTerminator()->getNumResults(); - std::vector results; - results.reserve(num_region_results); - - int return_value_offset = 0; - for (int region_id = 0; region_id < region_index; ++region_id) - return_value_offset += - GetRegionBlockWithIndex(region_id).getTerminator()->getNumResults(); - - for (int i = 0; i < num_region_results; ++i) - results.emplace_back(getOperation()->getOpResult(return_value_offset + i)); - - return results; -} - LogicalResult ParallelExecuteOp::verify() { return Verify(*this); } Block& ParallelExecuteOp::GetRegionBlockWithIndex(unsigned index) { return getOperation()->getRegion(index).front(); } +Operation::result_range ParallelExecuteOp::GetRegionOutputs( + unsigned region_index) { + int num_region_results = + GetRegionBlockWithIndex(region_index).getTerminator()->getNumOperands(); + + int return_value_offset = 0; + for (int region_id = 0; region_id < region_index; ++region_id) + return_value_offset += + GetRegionBlockWithIndex(region_id).getTerminator()->getNumOperands(); + + Operation::result_range region_results(getOperation(), + /*startIndex=*/return_value_offset, + /*count=*/num_region_results); + return region_results; +} + +bool ParallelExecuteOp::RegionWrapsSingleOp(unsigned index) { + return BlockWrapsSingleOp(&GetRegionBlockWithIndex(index)); +} + //===----------------------------------------------------------------------===// // tf_device.replicate //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h index 0cb26bbfe65..1b20120cc2e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h @@ -74,9 +74,14 @@ class ParallelExecuteOp static StringRef getOperationName() { return "tf_device.parallel_execute"; } - std::vector GetRegionOutputs(unsigned region_index); LogicalResult verify(); Block& GetRegionBlockWithIndex(unsigned index); + Operation::result_range GetRegionOutputs(unsigned region_index); + + // Checks if a tf_device.parallel_execute index'th region block wraps a single + // operation and the single operation results are perfectly forwarded to the + // region block's return. 
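// Aside (illustrative usage, not part of the patch): GetRegionOutputs now
// returns an Operation::result_range, so callers can iterate a region's
// results directly instead of building a std::vector, e.g.
//   for (Value output : parallel_execute.GetRegionOutputs(/*region_index=*/1))
//     region_outputs.push_back(output);
// where `parallel_execute` and `region_outputs` are hypothetical locals.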
+ bool RegionWrapsSingleOp(unsigned index); }; } // namespace tf_device diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 38f72f24bd1..3c47ef1117d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -128,7 +128,7 @@ def TfExecutor_GraphOp : TfExecutor_Op<"graph", def TfExecutor_FetchOp : TfExecutor_Op<"fetch", [Terminator, ControlOperandsAfterAllData, HasParent<"GraphOp">]> { let summary = [{ - The `tf_executor.fetch` operation terminates the graph and returns values"; + The `tf_executor.fetch` operation terminates the graph and returns values; }]; let description = [{ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 440aeaa49dc..d2bbbd32b7c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -319,6 +319,32 @@ this value or a subsequent newer value of the variable. TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<1>; } +def TF_Atan2Op : TF_Op<"Atan2", [NoSideEffect, ResultsBroadcastableShape]>, + WithBroadcastableBinOpBuilder { + let summary = [{ +Computes arctangent of `y/x` element-wise, respecting signs of the arguments. + }]; + + let description = [{ +This is the angle \( \theta \in [-\pi, \pi] \) such that +\[ x = r \cos(\theta) \] +and +\[ y = r \sin(\theta) \] +where \(r = \sqrt(x^2 + y^2) \). + }]; + + let arguments = (ins + TF_FpTensor:$y, + TF_FpTensor:$x + ); + + let results = (outs + TF_FpTensor:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AvgPoolOp : TF_Op<"AvgPool", [NoSideEffect]> { let summary = "Performs average pooling on the input."; @@ -426,6 +452,10 @@ about broadcasting TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + let verifier = [{ + return Verify(*this); + }]; + let hasCanonicalizer = 1; } @@ -481,7 +511,7 @@ reverse of SpaceToBatch. See below for a precise description. TF_DerivedOperandTypeAttr Tblock_shape = TF_DerivedOperandTypeAttr<1>; } -def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect, TF_LayoutSensitiveInterface]> { +def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect]> { let summary = "Adds `bias` to `value`."; let description = [{ @@ -505,13 +535,6 @@ Broadcasting is supported, so `value` may have any number of dimensions. let verifier = [{ return Verify(*this); }]; - - let extraClassDeclaration = [{ - // TF_LayoutSensitiveInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {0}; } - LogicalResult UpdateDataFormat(StringRef data_format); - }]; } def TF_BiasAddGradOp : TF_Op<"BiasAddGrad", [NoSideEffect]> { @@ -1036,6 +1059,7 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. // TF_LayoutSensitiveInterface: SmallVector GetLayoutDependentArgs() { return {0}; } SmallVector GetLayoutDependentResults() { return {0}; } + StringRef GetOptimalLayout(const RuntimeDevices& devices); LogicalResult UpdateDataFormat(StringRef data_format); }]; } @@ -1184,6 +1208,56 @@ and `B, D, F, H` as group 1. 
Thus we get the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect]> { + let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; + + let description = [{ +By default, this op performs an inclusive cumsum, which means that the first +element of the input is identical to the first element of the output: + +```python +tf.cumsum([a, b, c]) # => [a, a + b, a + b + c] +``` + +By setting the `exclusive` kwarg to `True`, an exclusive cumsum is +performed instead: + +```python +tf.cumsum([a, b, c], exclusive=True) # => [0, a, a + b] +``` + +By setting the `reverse` kwarg to `True`, the cumsum is performed in the +opposite direction: + +```python +tf.cumsum([a, b, c], reverse=True) # => [a + b + c, b + c, c] +``` + +This is more efficient than using separate `tf.reverse` ops. + +The `reverse` and `exclusive` kwargs can also be combined: + +```python +tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] +``` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TF_I32OrI64Tensor:$axis, + + DefaultValuedAttr:$exclusive, + DefaultValuedAttr:$reverse + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; +} + def TF_DepthToSpaceOp : TF_Op<"DepthToSpace", [NoSideEffect]> { let summary = "DepthToSpace for tensors of type T."; @@ -4870,6 +4944,8 @@ operation. ); TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; + + let hasCanonicalizer = 1; } def TF_RealOp : TF_Op<"Real", [NoSideEffect, SameOperandsAndResultShape]> { @@ -5120,7 +5196,7 @@ Resize `images` to `size` using nearest neighbor interpolation. }]; let arguments = (ins - TensorOf<[F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, I32Tensor:$size, DefaultValuedAttr:$align_corners, @@ -5128,7 +5204,7 @@ Resize `images` to `size` using nearest neighbor interpolation. ); let results = (outs - TensorOf<[F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$resized_images + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$resized_images ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6659,9 +6735,9 @@ receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and // `begin_indices`, `end_indices`, and `strides` with their canonical // values, respectively. bool GetSlicedBoundRanges( - ::llvm::SmallVectorImpl *begin_indices, - ::llvm::SmallVectorImpl *end_indices, - ::llvm::SmallVectorImpl *strides); + ::llvm::SmallVectorImpl *slice_begin, + ::llvm::SmallVectorImpl *slice_end, + ::llvm::SmallVectorImpl *slice_stride); }]; } @@ -6708,10 +6784,10 @@ shape of `StridedSlice`'s `input`. // `begin_indices`, `end_indices`, and `strides` with their canonical // values, respectively. 
bool GetSlicedShapeAndBoundRanges( - ::llvm::SmallVectorImpl *shape, - ::llvm::SmallVectorImpl *begin_indices, - ::llvm::SmallVectorImpl *end_indices, - ::llvm::SmallVectorImpl *strides); + ::llvm::SmallVectorImpl *input_shape, + ::llvm::SmallVectorImpl *slice_begin, + ::llvm::SmallVectorImpl *slice_end, + ::llvm::SmallVectorImpl *slice_stride); }]; } @@ -7020,6 +7096,275 @@ is the corresponding input gradient. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_TensorArrayCloseV3Op : TF_Op<"TensorArrayCloseV3", []> { + let summary = "Delete the TensorArray from its resource container."; + + let description = [{ +This enables the user to close and release the resource in the middle +of a step/run. + }]; + + let arguments = (ins + TF_ResourceTensor:$handle + ); + + let results = (outs); +} + +def TF_TensorArrayConcatV3Op : TF_Op<"TensorArrayConcatV3", []> { + let summary = "Concat the elements from the TensorArray into value `value`."; + + let description = [{ +Takes `T` elements of shapes + + ``` + (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...) + ``` + +and concatenates them into a Tensor of shape: + + ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)``` + +All elements must have the same shape (excepting the first dimension). + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + F32Tensor:$flow_in, + + DefaultValuedAttr:$element_shape_except0 + ); + + let results = (outs + TF_Tensor:$value, + I64Tensor:$lengths + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_TensorArrayGatherV3Op : TF_Op<"TensorArrayGatherV3", []> { + let summary = [{ +Gather specific elements from the TensorArray into output `value`. + }]; + + let description = [{ +All elements selected by `indices` must have the same shape. + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + I32Tensor:$indices, + F32Tensor:$flow_in, + + DefaultValuedAttr:$element_shape + ); + + let results = (outs + TF_Tensor:$value + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_TensorArrayGradV3Op : TF_Op<"TensorArrayGradV3", []> { + let summary = [{ +Creates a TensorArray for storing the gradients of values in the given handle. + }]; + + let description = [{ +If the given TensorArray gradient already exists, returns a reference to it. + +Locks the size of the original TensorArray by disabling its dynamic size flag. + +**A note about the input flow_in:** + +The handle flow_in forces the execution of the gradient lookup to occur +only after certain other operations have occurred. For example, when +the forward TensorArray is dynamically sized, writes to this TensorArray +may resize the object. The gradient TensorArray is statically sized based +on the size of the forward TensorArray when this operation executes. +Furthermore, the size of the forward TensorArray is frozen by this call. +As a result, the flow is used to ensure that the call to generate the gradient +TensorArray only happens after all writes are executed. + +In the case of dynamically sized TensorArrays, gradient computation should +only be performed on read operations that have themselves been chained via +flow to occur only after all writes have executed. That way the final size +of the forward TensorArray is known when this operation is called. + +**A note about the source attribute:** + +TensorArray gradient calls use an accumulator TensorArray object. 
If +multiple gradients are calculated and run in the same session, the multiple +gradient nodes may accidentally flow through the same accumulator TensorArray. +This double counts and generally breaks the TensorArray gradient flow. + +The solution is to identify which gradient call this particular +TensorArray gradient is being called in. This is performed by identifying +a unique string (e.g. "gradients", "gradients_1", ...) from the input +gradient Tensor's name. This string is used as a suffix when creating +the TensorArray gradient object here (the attribute `source`). + +The attribute `source` is added as a suffix to the forward TensorArray's +name when performing the creation / lookup, so that each separate gradient +calculation gets its own TensorArray accumulator. + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + F32Tensor:$flow_in, + + StrAttr:$source + ); + + let results = (outs + TF_ResourceTensor:$grad_handle, + F32Tensor:$flow_out + ); +} + +def TF_TensorArrayReadV3Op : TF_Op<"TensorArrayReadV3", []> { + let summary = "Read an element from the TensorArray into output `value`."; + + let description = [{ + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + I32Tensor:$index, + F32Tensor:$flow_in + ); + + let results = (outs + TF_Tensor:$value + ); + + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_TensorArrayScatterV3Op : TF_Op<"TensorArrayScatterV3", []> { + let summary = [{ +Scatter the data from the input value into specific TensorArray elements. + }]; + + let description = [{ +`indices` must be a vector, its length must match the first dim of `value`. + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + I32Tensor:$indices, + TF_Tensor:$value, + F32Tensor:$flow_in + ); + + let results = (outs + F32Tensor:$flow_out + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_TensorArraySizeV3Op : TF_Op<"TensorArraySizeV3", []> { + let summary = "Get the current size of the TensorArray."; + + let description = [{ + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + F32Tensor:$flow_in + ); + + let results = (outs + I32Tensor:$size + ); +} + +def TF_TensorArraySplitV3Op : TF_Op<"TensorArraySplitV3", []> { + let summary = [{ +Split the data from the input value into TensorArray elements. + }]; + + let description = [{ +Assuming that `lengths` takes on values + + ```(n0, n1, ..., n(T-1))``` + +and that `value` has shape + + ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```, + +this splits values into a TensorArray with T tensors. + +TensorArray index t will be the subtensor of values with starting position + + ```(n0 + n1 + ... + n(t-1), 0, 0, ...)``` + +and having size + + ```nt x d0 x d1 x ...``` + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + TF_Tensor:$value, + I64Tensor:$lengths, + F32Tensor:$flow_in + ); + + let results = (outs + F32Tensor:$flow_out + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + +def TF_TensorArrayV3Op : TF_Op<"TensorArrayV3", []> { + let summary = "An array of Tensors of given size."; + + let description = [{ +Write data via Write and read via Read or Pack. 
+ }]; + + let arguments = (ins + I32Tensor:$size, + + TypeAttr:$dtype, + DefaultValuedAttr:$element_shape, + DefaultValuedAttr:$dynamic_size, + DefaultValuedAttr:$clear_after_read, + DefaultValuedAttr:$identical_element_shapes, + StrAttr:$tensor_array_name + ); + + let results = (outs + TF_ResourceTensor:$handle, + F32Tensor:$flow + ); +} + +def TF_TensorArrayWriteV3Op : TF_Op<"TensorArrayWriteV3", []> { + let summary = "Push an element onto the tensor_array."; + + let description = [{ + }]; + + let arguments = (ins + TF_ResourceTensor:$handle, + I32Tensor:$index, + TF_Tensor:$value, + F32Tensor:$flow_in + ); + + let results = (outs + F32Tensor:$flow_out + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + def TF_TensorListConcatV2Op : TF_Op<"TensorListConcatV2", [NoSideEffect]> { let summary = "Concats all tensors in the list along the 0th dimension."; @@ -7052,6 +7397,27 @@ lengths: Output tensor containing sizes of the 0th dimension of tensors in the l TF_DerivedResultTypeAttr element_dtype = TF_DerivedResultTypeAttr<0>; } +def TF_TensorListElementShapeOp : TF_Op<"TensorListElementShape", [NoSideEffect]> { + let summary = "The shape of the elements of the given list, as a tensor."; + + let description = [{ +input_handle: the list + element_shape: the shape of elements of the list + }]; + + let arguments = (ins + TF_VariantTensor:$input_handle + ); + + let results = (outs + TF_I32OrI64Tensor:$element_shape + ); + + TF_DerivedResultTypeAttr shape_type = TF_DerivedResultTypeAttr<0>; + + let hasFolder = 1; +} + def TF_TensorListFromTensorOp : TF_Op<"TensorListFromTensor", [NoSideEffect]> { let summary = [{ Creates a TensorList which, when stacked, has the value of `tensor`. @@ -7820,6 +8186,8 @@ shape(t) ==> [2, 2, 3] let verifier = [{ return Verify(*this); }]; + + let hasFolder = 1; } def TF_WhereOp : TF_Op<"Where", [NoSideEffect]> { @@ -8013,8 +8381,9 @@ used to look up the program in the compilation cache. let results = (outs TF_StrTensor:$compilation_status, - TF_StrTensor:$program + Variadic:$program ); + TF_DerivedResultSizeAttr num_computations = TF_DerivedResultSizeAttr<1>; TF_DerivedOperandSizeAttr NumDynamicShapes = TF_DerivedOperandSizeAttr<0>; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 92e6d522125..773025c58df 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -23,6 +23,7 @@ limitations under the License. 
#define TF_OP_BASE include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffects.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td" //===----------------------------------------------------------------------===// @@ -86,6 +87,7 @@ class TF_TensorFlowType : // Any tensor element type allowed in TensorFlow ops def TF_ElementType : Type, "tf.dtype">; @@ -96,20 +98,20 @@ def TF_Tensor : TensorOf<[TF_ElementType]>; //===----------------------------------------------------------------------===// // Integer types -def TF_I32Or64 : IntOfWidths<[32, 64]>; +def TF_I32Or64 : SignlessIntOfWidths<[32, 64]>; def TF_I32OrI64Tensor : TensorOf<[TF_I32Or64]>; -def TF_Uint8 : TF_TensorFlowType<"Uint8", "uint8">; -def TF_Uint16 : TF_TensorFlowType<"Uint16", "uint16">; -def TF_Uint32 : TF_TensorFlowType<"Uint32", "uint32">; -def TF_Uint64 : TF_TensorFlowType<"Uint64", "uint64">; +def TF_Uint8 : UI<8>; +def TF_Uint16 : UI<16>; +def TF_Uint32 : UI<32>; +def TF_Uint64 : UI<64>; // Any unsigned integer type -def TF_UInt : AnyTypeOf<[TF_Uint8, TF_Uint16, TF_Uint32, TF_Uint64]>; +def TF_UInt : UnsignedIntOfWidths<[8, 16, 32, 64]>; // Any signed integer type -def TF_SInt : IntOfWidths<[8, 16, 32, 64]>; +def TF_SInt : SignlessIntOfWidths<[8, 16, 32, 64]>; // Any integer type def TF_Int : AnyTypeOf<[TF_SInt, TF_UInt]>; @@ -192,6 +194,16 @@ def TF_NumberOrStrTensor : TensorOf<[TF_NumberOrStr]>; // TensorFlow attribute definitions //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Tensorflow devices metadata + +// Tensorflow GPU device metadata. +def TF_GpuDeviceMetadata : StructAttr<"GpuDeviceMetadata", TF_Dialect, [ + // GPU device compute capability: major:minor. + StructFieldAttr<"cc_major", I32Attr>, + StructFieldAttr<"cc_minor", I32Attr> +]>; + //===----------------------------------------------------------------------===// // String attribute constraints diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td index cc0819d71c9..3743bdda043 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td @@ -50,11 +50,14 @@ def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> { [{Returns indices of layout dependent results.}], "SmallVector", "GetLayoutDependentResults", (ins) >, + InterfaceMethod< + [{Returns the optimal data layout based on the available devices.}], + "StringRef", "GetOptimalLayout", (ins "const RuntimeDevices&":$devices) + >, InterfaceMethod< [{Updates operation attributes and operands to account for the updated data format. If data format is not supported, must return failure.}], - "LogicalResult", "UpdateDataFormat", - (ins "StringRef":$data_format) + "LogicalResult", "UpdateDataFormat", (ins "StringRef":$data_format) >, ]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 8d4c284bcf8..9cec3641d0a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -57,6 +57,7 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // TF:llvm-project #include "mlir/Support/STLExtras.h" // TF:llvm-project #include "mlir/Transforms/InliningUtils.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" @@ -292,6 +293,51 @@ static LogicalResult VerifyTypesCompatibility( return success(); } +//===----------------------------------------------------------------------===// +// Helper functions detect device capabilities from RuntimeDevices. +//===----------------------------------------------------------------------===// + +namespace { +using DeviceNameUtils = ::tensorflow::DeviceNameUtils; +using ParsedName = ::tensorflow::DeviceNameUtils::ParsedName; + +bool IsGpuDevice(const DeviceNameUtils::ParsedName &device) { + return device.type == ::tensorflow::DEVICE_GPU; +} + +} // namespace + +// Returns true if at least one GPU device is available at runtime. +bool CanUseGpuDevice(const RuntimeDevices &devices) { + return llvm::any_of(devices.device_names(), IsGpuDevice); +} + +// Returns true if all of the GPUs available at runtime support TensorCores +// (NVIDIA compute capability >= 7.0). +bool CanUseTensorCores(const RuntimeDevices &devices) { + auto has_tensor_cores = [&](const DeviceNameUtils::ParsedName &device) { + auto md = devices.GetGpuDeviceMetadata(device); + return md ? md->cc_major().getInt() >= 7 : false; + }; + return llvm::all_of( + llvm::make_filter_range(devices.device_names(), IsGpuDevice), + has_tensor_cores); +} + +// Returns true if operation does not have explicit device placement that would +// prevent it from running on GPU device. +bool CanUseGpuDevice(Operation *op) { + auto device_attr = op->getAttrOfType("device"); + if (!device_attr || device_attr.getValue().empty()) return true; + + DeviceNameUtils::ParsedName device; + if (!DeviceNameUtils::ParseFullName(device_attr.getValue().str(), &device)) + return false; + + // We can't use GPU if operation explicitly placed on non-GPU device. + return !device.has_type || device.type == ::tensorflow::DEVICE_GPU; +} + //===----------------------------------------------------------------------===// // TF op helper functions to work with layout transformation. //===----------------------------------------------------------------------===// @@ -566,6 +612,16 @@ void BatchMatMulOp::getCanonicalizationPatterns( // BatchMatMulV2Op //===----------------------------------------------------------------------===// +static LogicalResult Verify(BatchMatMulV2Op op) { + if (!HasRankAtLeast(op.x(), 2)) { + return op.emitOpError("requires lhs operand to have rank at least two"); + } + if (!HasRankAtLeast(op.y(), 2)) { + return op.emitOpError("requires rhs operand to have rank at least two"); + } + return success(); +} + void BatchMatMulV2Op::getCanonicalizationPatterns( OwningRewritePatternList &results, MLIRContext *context) { results.insert(context); @@ -617,15 +673,6 @@ static LogicalResult Verify(BiasAddOp op) { return success(); } -// TODO(ezhulenev): BiasAddOp is not really layout sensitive, it must only -// support folding operand transposes. 
-LogicalResult BiasAddOp::UpdateDataFormat(StringRef data_format) {
-  auto ranked = value().getType().dyn_cast();
-  if (!ranked || ranked.getRank() != 4) return failure();
-
-  return ::mlir::TF::UpdateDataFormat(data_format, this);
-}
-
 //===----------------------------------------------------------------------===//
 // BiasAddGradOp
 //===----------------------------------------------------------------------===//
@@ -999,6 +1046,59 @@ LogicalResult Conv2DOp::UpdateDataFormat(StringRef data_format) { return success(); }
+StringRef Conv2DOp::GetOptimalLayout(const RuntimeDevices &devices) {
+  // Keep the current data format if no GPUs are available or if explicit
+  // placement does not allow using the GPU for this operation.
+  if (!CanUseGpuDevice(devices) || !CanUseGpuDevice(getOperation()))
+    return data_format();
+
+  // Input must be a tensor.
+  auto input_ty = input().getType().dyn_cast();
+  if (!input_ty) return data_format();
+
+  // For the f16 data type on devices with Tensor Core support, the NHWC data
+  // format is up to ~2x faster.
+  const bool is_f16 = input_ty.getElementType().isF16();
+  if (is_f16 && CanUseTensorCores(devices)) return "NHWC";
+
+  // For f32/f16 data types the decision depends on the filter size in the
+  // spatial dimensions; for other data types we keep the current data format.
+  if (!input_ty.getElementType().isF32() && !input_ty.getElementType().isF16())
+    return data_format();
+
+  // Keep the current data format if the filter rank is unknown or not equal to 4.
+  auto filter_ty = filter().getType().dyn_cast();
+  if (!filter_ty || filter_ty.getRank() != 4) return data_format();
+
+  const int64_t d0 = filter_ty.getDimSize(0);
+  const int64_t d1 = filter_ty.getDimSize(1);
+
+  auto all_ones = [](ArrayAttr arr) -> bool {
+    return llvm::all_of(arr, [](Attribute attr) -> bool {
+      return attr.cast().getInt() == 1;
+    });
+  };
+
+  // Convolutions with a 1x1 filter and with strides and dilations all ones can
+  // be computed as a GEMM in NHWC data format, and can be up to ~2x faster
+  // than a convolution in NCHW.
+  const bool one_by_one = d0 == 1 && d1 == 1;
+  const bool trivial_strides = all_ones(strides());
+  const bool trivial_dilations = all_ones(dilations());
+
+  // TODO(ezhulenev): This might lead to excessive transposes in the final IR,
+  // if the ratio of 1x1 convolutions to regular convolutions is close to 1:1.
+  // Also FusedBatchNorm in training mode prefers NCHW data format. Check if all
+  // users can efficiently use the NHWC data format?
+  if (one_by_one && trivial_strides && trivial_dilations) {
+    return "NHWC";
+  }
+
+  // If the filter spatial dimensions are unknown or not 1x1 we prefer NCHW,
+  // because it's the fastest option on NVIDIA GPUs with cuDNN library support.
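+
+  // Decision summary (descriptive only, mirrors the checks above):
+  //   - f16 input and every available GPU has Tensor Cores      -> "NHWC"
+  //   - f32/f16 input, 1x1 filter, unit strides and dilations   -> "NHWC"
+  //   - no usable GPU, other dtypes, or unknown filter rank     -> keep current
+  //   - otherwise                                               -> "NCHW"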
+ return "NCHW"; +} + //===----------------------------------------------------------------------===// // Conv2dBackpropInputOp //===----------------------------------------------------------------------===// @@ -1495,6 +1595,15 @@ void LogOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +//===----------------------------------------------------------------------===// +// ReadVariableOp +//===----------------------------------------------------------------------===// + +void ReadVariableOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // LogicalNotOp //===----------------------------------------------------------------------===// @@ -2612,9 +2721,8 @@ constexpr void CopyBit(const T &src, unsigned src_index, T &dst, // dimensions. For example, sparse spec for foo[..., 3:10] for foo of shape (2, // 4, 8) would have dims = 2. struct SparseSliceSpec { - const int64_t dims; - const uint64_t begin_mask, end_mask, ellipsis_mask, new_axis_mask, - shrink_axis_mask; + int64_t dims; + int32_t begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask; const ArrayRef &begin; const ArrayRef &end; const ArrayRef &strides; @@ -2625,7 +2733,7 @@ struct SparseSliceSpec { // in operand tensor. struct DenseSliceSpec { int64_t dims; - uint64_t begin_mask, end_mask, shrink_axis_mask; + int32_t begin_mask, end_mask, shrink_axis_mask; SmallVectorImpl &begin; SmallVectorImpl &end; SmallVectorImpl &strides; @@ -2638,8 +2746,8 @@ struct DenseSliceSpec { // For example suppose foo[...,3:, 2] on foo.shape=(2,2,3,4) then // we need to produce the missing begin_mask, end_mask for the first two // dimensions i.e. foo[:, :, 3:, 2]. -static LogicalResult BuildDenseSliceSpec(const SparseSliceSpec &sparse, - DenseSliceSpec *dense) { +static void BuildDenseSliceSpec(const SparseSliceSpec &sparse, + DenseSliceSpec *dense) { // Build expanded dense begin, end, strides, begin_mask, end_mask, and // shrink_axis_mask. dense->begin.resize(dense->dims); @@ -2689,7 +2797,6 @@ static LogicalResult BuildDenseSliceSpec(const SparseSliceSpec &sparse, dense_index); dense_index++; } - return success(); } // For the given `input_shape`, calculates the sliced shape using the given @@ -2699,7 +2806,7 @@ static LogicalResult BuildDenseSliceSpec(const SparseSliceSpec &sparse, // dimensions in `input_shape`; it will turn them into 1s. At the same time, // canonicalizes `begin`, `end`, and `strides. The calculation follows // tf.StridedSlice op semantics. -static void CalculateSlicedShapeAndBoundRanges( +static void CalculateSlicedShapeFromDenseIndices( MutableArrayRef input_shape, int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, MutableArrayRef begin, MutableArrayRef end, MutableArrayRef stride) { @@ -2759,21 +2866,59 @@ static void CalculateSlicedShapeAndBoundRanges( } } +// For the given `input_shape`, calculates the sliced shape using the given +// `sparse_begin`, `sparse_end`, and `sparse_strides` ranges and `begin_mask`, +// `end_mask`, `ellipsis_mask` , `new_axis_mask` and `shrink_axis_mask` masks. +// Updates the result back to `input_shape`. 
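+// Worked example (descriptive only): for `input_shape` = [4, 8] and the slice
+// foo[2:4], the single sparse dimension gets an implicit trailing ellipsis,
+// which effectively densifies to begin = [2, 0], end = [4, 8],
+// strides = [1, 1], and updates `input_shape` to [2, 8].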
+static void CalculateSlicedShapeFromSparseIndices( + MutableArrayRef input_shape, ArrayRef sparse_begin, + ArrayRef sparse_end, ArrayRef sparse_strides, + int32_t begin_mask, int32_t end_mask, int32_t ellipsis_mask, + int32_t new_axis_mask, int32_t shrink_axis_mask, + SmallVectorImpl *begin, SmallVectorImpl *end, + SmallVectorImpl *stride) { + int64_t num_sparse_indices = sparse_begin.size(); + SparseSliceSpec sparse = {num_sparse_indices, begin_mask, end_mask, + ellipsis_mask, new_axis_mask, shrink_axis_mask, + sparse_begin, sparse_end, sparse_strides}; + + // If no ellipsis_mask exists then an implicit ellipsis_mask at the end is + // inserted. This handles cases where foo[2:4] (foo.shape() = [4, 8]) yields + // a tensor of shape [2, 8], i.e., foo[2:4] is same as foo[2:4, ...]. + if (sparse.ellipsis_mask == 0) { + Set(sparse.ellipsis_mask, sparse.dims); + sparse.dims++; + } + + int64_t dims = input_shape.size(); + DenseSliceSpec dense = {dims, + /*begin_mask = */ 0, + /*end_mask = */ 0, + /*shrink_axis_mask = */ 0, + *begin, + *end, + *stride}; + + BuildDenseSliceSpec(sparse, &dense); + CalculateSlicedShapeFromDenseIndices(input_shape, dense.begin_mask, + dense.end_mask, dense.shrink_axis_mask, + *begin, *end, *stride); +} + bool StridedSliceOp::GetSlicedBoundRanges( - SmallVectorImpl *begin_indices, - SmallVectorImpl *end_indices, SmallVectorImpl *strides) { + SmallVectorImpl *slice_begin, SmallVectorImpl *slice_end, + SmallVectorImpl *slice_stride) { // TODO(hinsu): Support lowering for ops with dynamic begin and end values // when it is possible to derive indices based on mask attributes. DenseIntElementsAttr sparse_begin_attr, sparse_end_attr, sparse_strides_attr; - if (!matchPattern(this->begin(), m_Constant(&sparse_begin_attr)) || - !matchPattern(this->end(), m_Constant(&sparse_end_attr)) || - !matchPattern(this->strides(), m_Constant(&sparse_strides_attr))) + if (!matchPattern(begin(), m_Constant(&sparse_begin_attr)) || + !matchPattern(end(), m_Constant(&sparse_end_attr)) || + !matchPattern(strides(), m_Constant(&sparse_strides_attr))) return false; auto input_ty = this->input().getType().dyn_cast(); if (!input_ty || !input_ty.hasStaticShape()) return false; auto input_shape = llvm::to_vector<4>(input_ty.getShape()); - int rank = input_shape.size(); SmallVector sparse_begin, sparse_end, sparse_strides; @@ -2784,30 +2929,11 @@ bool StridedSliceOp::GetSlicedBoundRanges( for (const APInt &stride : sparse_strides_attr) sparse_strides.push_back(stride.getSExtValue()); - auto num_sparse_indices = sparse_begin_attr.getNumElements(); - SparseSliceSpec sparse = {num_sparse_indices, - this->begin_mask().getZExtValue(), - this->end_mask().getZExtValue(), - this->ellipsis_mask().getZExtValue(), - this->new_axis_mask().getZExtValue(), - this->shrink_axis_mask().getZExtValue(), - sparse_begin, - sparse_end, - sparse_strides}; - - DenseSliceSpec dense = {rank, - /*begin_mask = */ 0, - /*end_mask = */ 0, - /*shrink_axis_mask = */ 0, - *begin_indices, - *end_indices, - *strides}; - - if (failed(BuildDenseSliceSpec(sparse, &dense))) return false; - - CalculateSlicedShapeAndBoundRanges(input_shape, dense.begin_mask, - dense.end_mask, dense.shrink_axis_mask, - *begin_indices, *end_indices, *strides); + CalculateSlicedShapeFromSparseIndices( + input_shape, sparse_begin, sparse_end, sparse_strides, + begin_mask().getZExtValue(), end_mask().getZExtValue(), + ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), + shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); return 
true; } @@ -2830,44 +2956,38 @@ static LogicalResult Verify(StridedSliceGradOp op) { } bool StridedSliceGradOp::GetSlicedShapeAndBoundRanges( - SmallVectorImpl *shape, SmallVectorImpl *begin_indices, - SmallVectorImpl *end_indices, SmallVectorImpl *strides) { - if (this->ellipsis_mask().getZExtValue() || - this->new_axis_mask().getZExtValue() || - this->shrink_axis_mask().getZExtValue()) - return false; // TODO(b/146512589): support these masks - + SmallVectorImpl *input_shape, + SmallVectorImpl *slice_begin, SmallVectorImpl *slice_end, + SmallVectorImpl *slice_stride) { DenseIntElementsAttr shape_attr; - DenseIntElementsAttr begin_indices_attr, end_indices_attr, strides_attr; - if (!matchPattern(this->shape(), m_Constant(&shape_attr)) || - !matchPattern(this->begin(), m_Constant(&begin_indices_attr)) || - !matchPattern(this->end(), m_Constant(&end_indices_attr)) || - !matchPattern(this->strides(), m_Constant(&strides_attr))) + DenseIntElementsAttr sparse_begin_attr, sparse_end_attr, sparse_strides_attr; + if (!matchPattern(shape(), m_Constant(&shape_attr)) || + !matchPattern(begin(), m_Constant(&sparse_begin_attr)) || + !matchPattern(end(), m_Constant(&sparse_end_attr)) || + !matchPattern(strides(), m_Constant(&sparse_strides_attr))) return false; int rank = std::distance(shape_attr.begin(), shape_attr.end()); - shape->clear(); - shape->reserve(rank); - begin_indices->clear(); - begin_indices->reserve(rank); - end_indices->clear(); - end_indices->reserve(rank); - strides->clear(); - strides->reserve(rank); + input_shape->clear(); + input_shape->reserve(rank); + for (const APInt &dim : shape_attr) + input_shape->push_back(dim.getSExtValue()); - for (const APInt &dim : shape_attr) shape->push_back(dim.getSExtValue()); - for (const APInt &index : begin_indices_attr) - begin_indices->push_back(index.getSExtValue()); - for (const APInt &index : end_indices_attr) - end_indices->push_back(index.getSExtValue()); - for (const APInt &stride : strides_attr) - strides->push_back(stride.getSExtValue()); + SmallVector sparse_begin, sparse_end, sparse_strides; - CalculateSlicedShapeAndBoundRanges(*shape, this->begin_mask().getZExtValue(), - this->end_mask().getZExtValue(), - this->shrink_axis_mask().getZExtValue(), - *begin_indices, *end_indices, *strides); + for (const APInt &index : sparse_begin_attr) + sparse_begin.push_back(index.getSExtValue()); + for (const APInt &index : sparse_end_attr) + sparse_end.push_back(index.getSExtValue()); + for (const APInt &stride : sparse_strides_attr) + sparse_strides.push_back(stride.getSExtValue()); + + CalculateSlicedShapeFromSparseIndices( + *input_shape, sparse_begin, sparse_end, sparse_strides, + begin_mask().getZExtValue(), end_mask().getZExtValue(), + ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), + shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); return true; } @@ -2887,6 +3007,19 @@ static LogicalResult Verify(TensorListReserveOp op) { return success(); } +//===----------------------------------------------------------------------===// +// TensorListElementShapeOp +//===----------------------------------------------------------------------===// + +OpFoldResult TensorListElementShapeOp::fold(ArrayRef operands) { + int width = + getType().cast().getElementType().getIntOrFloatBitWidth(); + auto variant_type = + getElementTypeOrSelf(getOperand().getType()).cast(); + if (variant_type.getSubtypes().empty()) return {}; + return ConvertShapeToAttr(variant_type.getSubtypes()[0], width); +} + 
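+// The fold above reads the element shape from the first variant subtype. For
+// example (descriptive only), an operand whose subtype is tensor<2x4xf32>
+// folds to the constant dense<[2, 4]> : tensor<2xi32>, as exercised by the
+// constant-fold.mlir test added in this change.
+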
//===----------------------------------------------------------------------===// // TensorListStackOp //===----------------------------------------------------------------------===// @@ -3157,6 +3290,15 @@ static LogicalResult Verify(VariableShapeOp op) { } } +OpFoldResult VariableShapeOp::fold(ArrayRef operands) { + int width = + getType().cast().getElementType().getIntOrFloatBitWidth(); + auto resource_type = + getElementTypeOrSelf(getOperand().getType()).cast(); + if (resource_type.getSubtypes().empty()) return {}; + return ConvertShapeToAttr(resource_type.getSubtypes()[0], width); +} + //===----------------------------------------------------------------------===// // WhileOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 9c80c43042b..fbd1a335be1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -19,7 +19,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_H_ -#include "mlir/Analysis/CallInterfaces.h" // TF:llvm-project #include "mlir/Dialect/Traits.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project @@ -29,6 +28,10 @@ limitations under the License. #include "mlir/IR/OpImplementation.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/TypeUtilities.h" // TF:llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // TF:llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // TF:llvm-project +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index e95fcbbdad3..c1c6a643ef1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -28,7 +28,7 @@ limitations under the License. #define TF_OPS include "tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td" -include "mlir/Analysis/CallInterfaces.td" +include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" class TF_TensorListInitOp : TF_Op { @@ -64,7 +64,7 @@ class TF_TensorListInitOp : TF_Op { // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with // its type encoding the tensor's shape and data type. -def TF_ConstOp : TF_Op<"Const", [NoSideEffect]> { +def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect]> { let summary = "Constant tensor op"; let arguments = (ins @@ -550,4 +550,44 @@ Example: TF_DerivedOperandOrResultHandleShapeAttr shape = TF_DerivedOperandOrResultHandleShapeAttr<"resource">; } + +// Not generated because it begins with an underscore, which isn't allowed by +// the C++ standard. +def TF_FusedBatchNormExOp : TF_Op<"_FusedBatchNormEx", [NoSideEffect]> { + let summary = "Internal FusedBatchNorm operation: reserved for internal use"; + + let description = [{ + Do not invoke this operator directly in Python. A fusion optimization is + expected to create these operators. 
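+
+  For example, the tf-gpu-op-fusion pass (exercised by the gpu_fusion.mlir
+  test added in this change) rewrites an inference-mode FusedBatchNormV3
+  followed by an optional AddV2 side input and a Relu activation into a
+  single _FusedBatchNormEx.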
+ }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$x, + F32Tensor:$scale, + F32Tensor:$offset, + F32Tensor:$mean, + F32Tensor:$variance, + Variadic>:$side_input, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$activation_mode, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[F16, F32]>:$y, + F32Tensor:$batch_mean, + F32Tensor:$batch_variance, + F32Tensor:$reserve_space_1, + F32Tensor:$reserve_space_2, + F32Tensor:$reserve_space_3 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandSizeAttr num_side_inputs = TF_DerivedOperandSizeAttr<5>; +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc new file mode 100644 index 00000000000..6c5485c16dd --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" + +namespace mlir { + +// NOLINTNEXTLINE +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc.inc" + +namespace TF { + +void RuntimeDevices::AddDevice(const ParsedName& device) { + device_names_.push_back(device); +} + +void RuntimeDevices::AddGpuDevice(const ParsedName& device, + const GpuDeviceMetadata& metadata) { + device_names_.push_back(device); + gpu_metadata_.insert({DeviceNameUtils::ParsedNameToString(device), metadata}); +} + +llvm::Optional RuntimeDevices::GetGpuDeviceMetadata( + const ParsedName& device) const { + auto it = gpu_metadata_.find(DeviceNameUtils::ParsedNameToString(device)); + if (it != gpu_metadata_.end()) { + return it->second; + } else { + return llvm::None; + } +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h new file mode 100644 index 00000000000..65887a0c960 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h @@ -0,0 +1,67 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This file defines the types used in the standard MLIR TensorFlow dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_STRUCTS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_STRUCTS_H_ + +#include "llvm/ADT/StringMap.h" +#include "mlir/IR/Diagnostics.h" // TF:llvm-project +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/IR/Types.h" // TF:llvm-project +#include "tensorflow/core/util/device_name_utils.h" + +namespace mlir { + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h.inc" + +namespace TF { + +// Tensorflow devices available at runtime with corresponding metadata if it is +// available. It's completely valid to have a device without any metadata +// attached to it. +class RuntimeDevices { + using DeviceNameUtils = ::tensorflow::DeviceNameUtils; + using ParsedName = ::tensorflow::DeviceNameUtils::ParsedName; + + public: + // Adds a device with and empty metadata. Device can be of any type. + void AddDevice(const ParsedName& device); + + // Adds a GPU device with GPU specific metadata. + void AddGpuDevice(const ParsedName& device, + const GpuDeviceMetadata& metadata); + + llvm::ArrayRef device_names() const { return device_names_; } + size_t NumDevices() const { return device_names_.size(); } + + // Returns GPU device metadata if it is available, otherwise returns None. + llvm::Optional GetGpuDeviceMetadata( + const ParsedName& device) const; + + private: + llvm::SmallVector device_names_; + // TODO(ezhulenev): Add DenseMapInfo specialization to be able to + // use ParsedName as a key in a DenseMap. + llvm::StringMap gpu_metadata_; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_STRUCTS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index a3bba731581..ef97b234ef7 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -77,13 +77,17 @@ TensorFlowType TensorFlowRefType::get(Type type) { case 1: return BoolRefType::get(ctx); case 8: - return Int8RefType::get(ctx); + return itype.isUnsigned() ? TensorFlowType(Uint8RefType::get(ctx)) + : Int8RefType::get(ctx); case 16: - return Int16RefType::get(ctx); + return itype.isUnsigned() ? TensorFlowType(Uint16RefType::get(ctx)) + : Int16RefType::get(ctx); case 32: - return Int32RefType::get(ctx); + return itype.isUnsigned() ? TensorFlowType(Uint32RefType::get(ctx)) + : Int32RefType::get(ctx); case 64: - return Int64RefType::get(ctx); + return itype.isUnsigned() ? 
TensorFlowType(Uint64RefType::get(ctx)) + : Int64RefType::get(ctx); default: llvm_unreachable("unexpected integer type"); } @@ -121,6 +125,14 @@ Type TensorFlowRefType::RemoveRef() { return mlir::IntegerType::get(32, ctx); case TensorFlowTypes::INT64_REF: return mlir::IntegerType::get(64, ctx); + case TensorFlowTypes::UINT8_REF: + return mlir::IntegerType::get(8, IntegerType::Unsigned, ctx); + case TensorFlowTypes::UINT16_REF: + return mlir::IntegerType::get(16, IntegerType::Unsigned, ctx); + case TensorFlowTypes::UINT32_REF: + return mlir::IntegerType::get(32, IntegerType::Unsigned, ctx); + case TensorFlowTypes::UINT64_REF: + return mlir::IntegerType::get(64, IntegerType::Unsigned, ctx); case TensorFlowTypes::COMPLEX64_REF: return mlir::ComplexType::get(mlir::FloatType::getF32(ctx)); case TensorFlowTypes::COMPLEX128_REF: diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def index 0f5f7c17e02..a097a3cad88 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.def @@ -19,10 +19,6 @@ limitations under the License. #ifdef HANDLE_TF_TYPE // class, enumerant, name -HANDLE_TF_TYPE(Uint8, UINT8, "uint8") -HANDLE_TF_TYPE(Uint16, UINT16, "uint16") -HANDLE_TF_TYPE(Uint32, UINT32, "uint32") -HANDLE_TF_TYPE(Uint64, UINT64, "uint64") HANDLE_TF_TYPE(Qint8, QINT8, "qint8") HANDLE_TF_TYPE(Qint16, QINT16, "qint16") HANDLE_TF_TYPE(Qint32, QINT32, "qint32") diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index 4059aba209f..2898338f8eb 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -91,7 +91,7 @@ class TensorFlowType : public Type { // Returns true if the specified type is a valid TensorFlow element type. static inline bool IsValidTFElementType(Type type) { return type.isa() || type.isa() || - type.isSignlessInteger() || type.isa(); + type.isa() || type.isa(); } // Returns true if this is a valid TensorFlow tensor type. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir new file mode 100644 index 00000000000..1589de3d661 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/batchmatmul_to_einsum.mlir @@ -0,0 +1,44 @@ +// RUN: tf-opt %s -tf-batch-matmul-to-tf-einsum | FileCheck %s --dump-input-on-failure + +func @test_batch_matmul_to_einsum(%arg0: tensor<1x2x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x2x4xf32> { + // CHECK-LABEL: test_batch_matmul_to_einsum + // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<1x2x3xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x3xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + return %0: tensor<1x2x4xf32> +} + +func @test_batch_matmul_broadcast_to_einsum(%arg0: tensor<2x2x4xf32>, %arg1: tensor<2x4x2xf32>) -> tensor<2x2x2xf32> { + // CHECK-LABEL: test_batch_matmul_broadcast_to_einsum + // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<2x2x4xf32>, tensor<2x4x2xf32>) -> tensor<2x2x2xf32> + %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<2x2x4xf32>, tensor<2x4x2xf32>) -> tensor<2x2x2xf32> + return %0: tensor<2x2x2xf32> +} + +func @test_batch_matmul_dynamic_shape_both_arg_to_einsum(%arg0: tensor<1x2x?xf32>, %arg1: tensor) -> tensor<1x2x4xf32> { + // CHECK-LABEL: test_batch_matmul_dynamic_shape_both_arg_to_einsum + // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<1x2x?xf32>, tensor) -> tensor<1x2x4xf32> + %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x?xf32>, tensor) -> tensor<1x2x4xf32> + return %0: tensor<1x2x4xf32> +} + +func @test_batch_matmul_dynamic_shape_one_arg_to_einsum(%arg0: tensor<1x2x?xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x2x4xf32> { + // CHECK-LABEL: test_batch_matmul_dynamic_shape_one_arg_to_einsum + // CHECK: "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...kn->...mn"} : (tensor<1x2x?xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x?xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + return %0: tensor<1x2x4xf32> +} + +func @test_batch_matmul_adj_to_einsum(%arg0: tensor<1x2x3xf32>, %arg1: tensor<4x3xf32>) -> tensor<1x2x4xf32> { + // CHECK-LABEL: test_batch_matmul_adj_to_einsum + // CHECK: %[[RES_EINSUM:[0-9]*]] = "tf.Einsum"(%arg0, %arg1) {equation = "...mk,...nk->...mn"} : (tensor<1x2x3xf32>, tensor<4x3xf32>) -> tensor<1x2x4xf32> + // CHECK: return %[[RES_EINSUM]] : tensor<1x2x4xf32> + %0 = "tf.BatchMatMul"(%arg0, %arg1) {adj_x = false, adj_y = true} : (tensor<1x2x3xf32>, tensor<4x3xf32>) -> tensor<1x2x4xf32> + return %0: tensor<1x2x4xf32> +} + +func @test_batch_matmulV2_adj_to_einsum(%arg0: tensor<1x3x2xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x2x4xf32> { + // CHECK: %[[RES_EINSUM:[0-9]*]] = "tf.Einsum"(%arg0, %arg1) {equation = "...km,...kn->...mn"} : (tensor<1x3x2xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + // CHECK: return %[[RES_EINSUM]] : tensor<1x2x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = false} : (tensor<1x3x2xf32>, tensor<3x4xf32>) -> tensor<1x2x4xf32> + return %0: tensor<1x2x4xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 5bf5b0610ae..158fd3064a0 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -418,3 +418,37 @@ func @ToBool_0DScalar(%arg0: tensor) -> tensor { %0 = "tf.ToBool"(%arg0) : (tensor) -> tensor return %0 : tensor } + +// CHECK-LABEL: testReadVariableOfOfCast +func @testReadVariableOfOfCast(%arg0: tensor>>) -> tensor<8x40xf32> { + %0 = "tf.Cast"(%arg0) : (tensor>>) -> tensor<*x!tf.resource> + %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>) -> tensor<8x40xf32> + return %1: tensor<8x40xf32> + +// CHECK: %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor<8x40xf32> +// CHECK: return %0 +} + +// CHECK-LABEL: testReadVariableOfOfCastWithTruncate +func @testReadVariableOfOfCastWithTruncate(%arg0: tensor>>) -> tensor<8x40xf32> { + %0 = "tf.Cast"(%arg0) {Truncate = true} : (tensor>>) -> tensor<*x!tf.resource> + %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>) -> tensor<8x40xf32> + return %1: tensor<8x40xf32> + +// CHECK: %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor<8x40xf32> +// CHECK: return %0 +} + +// CHECK-LABEL: testReadVariableOfOfCastMultiUse +func @testReadVariableOfOfCastMultiUse(%arg0: tensor>>) -> tensor { + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor>>) -> tensor<*x!tf.resource> + %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>) -> tensor + "tf.AssignVariableOp"(%0, %1) : (tensor<*x!tf.resource>, tensor) -> () + return %1: tensor + + // CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor>>) -> tensor<*x!tf.resource> + // CHECK: %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>) -> tensor + // CHECK: "tf.AssignVariableOp"(%0, %1) : (tensor<*x!tf.resource>, tensor) -> () + // CHECK: return %1 +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index d9727e94bb6..411599053e5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -213,3 +213,21 @@ func @testRemoteDevice() -> tensor<2x2xi32> { // CHECK-NEXT: return [[cst]] : tensor<2x2xi32> return %2: tensor<2x2xi32> } + +// Tests ops that variable shapes are correctly evaluated on static types. +// CHECK-LABEL: func @testVariableShape +func @testVariableShape(%arg0: tensor>>) -> tensor<2xi32> { + %0 = "tf.VariableShape"(%arg0) : (tensor>>) -> tensor<2xi32> + // CHECK: [[cst:%.*]] = "tf.Const{{.*}} dense<{{\[}}2, 4]> : tensor<2xi32> + // CHECK-NEXT: return [[cst]] : tensor<2xi32> + return %0: tensor<2xi32> +} + +// Tests ops that tensor list shapes are correctly evaluated on static types. 
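+// (The folder reads the shape from the variant subtype; see
+// TensorListElementShapeOp::fold in tf_ops.cc above.)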
+// CHECK-LABEL: func @testTensorListElementShape +func @testTensorListElementShape(%arg0: tensor>>) -> tensor<2xi32> { + %0 = "tf.TensorListElementShape"(%arg0) : (tensor>>) -> tensor<2xi32> + // CHECK: [[cst:%.*]] = "tf.Const{{.*}} dense<{{\[}}2, 4]> : tensor<2xi32> + // CHECK-NEXT: return [[cst]] : tensor<2xi32> + return %0: tensor<2xi32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir new file mode 100644 index 00000000000..3dec94a98df --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir @@ -0,0 +1,57 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-einsum %s | FileCheck %s + +func @einsum_basic(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor<3x4x6xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,ikm->ijm"}: (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> + return %0 : tensor<3x4x6xf32> + // CHECK-LABEL: einsum_basic + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> +} + +func @einsum_4D(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x4x7x3xf32>) -> tensor<2x7x5x4xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "bfnh,btnh->bnft"}: (tensor<2x5x7x3xf32>, tensor<2x4x7x3xf32>) -> tensor<2x7x5x4xf32> + return %0 : tensor<2x7x5x4xf32> + // CHECK-LABEL: einsum_4D + // CHECK: %[[cst:.*]] = constant dense<[0, 2, 1, 3]> : tensor<4xi32> + // CHECK: %[[cst_1:.*]] = constant dense<[0, 2, 3, 1]> : tensor<4xi32> + // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7x3xf32>, tensor<4xi32>) -> tensor<2x7x5x3xf32> + // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<2x4x7x3xf32>, tensor<4xi32>) -> tensor<2x7x3x4xf32> + // CHECK: "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x7x5x3xf32>, tensor<2x7x3x4xf32>) -> tensor<2x7x5x4xf32> +} + +func @einsum_matrixdotprod(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<7x3x4xf32>) -> tensor<2x5x4xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "bfnd,ndh->bfh"}: (tensor<2x5x7x3xf32>, tensor<7x3x4xf32>) -> tensor<2x5x4xf32> + return %0 : tensor<2x5x4xf32> + // CHECK-LABEL: einsum_matrixdotprod + // CHECK: %[[cst:.*]] = constant dense<[2, 5, 21]> : tensor<3xi64> + // CHECK: %[[cst_1:.*]] = constant dense<[21, 4]> : tensor<2xi64> + // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x5x7x3xf32>, tensor<3xi64>) -> tensor<2x5x21xf32> + // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg1, %[[cst_1]]) : (tensor<7x3x4xf32>, tensor<2xi64>) -> tensor<21x4xf32> + // CHECK: "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x5x21xf32>, tensor<21x4xf32>) -> tensor<2x5x4xf32> +} + +func @einsum_reshapetail(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6x2xf32>) -> tensor<3x4x6x2xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "bfd,dnh->bfnh"}: (tensor<3x4x5xf32>, tensor<5x6x2xf32>) -> tensor<3x4x6x2xf32> + return %0 : tensor<3x4x6x2xf32> + // CHECK-LABEL: einsum_reshapetail + // CHECK: %[[cst:.*]] = constant dense<[5, 12]> : tensor<2xi64> + // CHECK: %[[cst_1:.*]] = constant dense<[3, 4, 6, 2]> : tensor<4xi64> + // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg1, %[[cst]]) : (tensor<5x6x2xf32>, tensor<2xi64>) -> tensor<5x12xf32> + // CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%arg0, %[[v0]]) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<5x12xf32>) -> 
tensor<3x4x12xf32> + // CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<3x4x12xf32>, tensor<4xi64>) -> tensor<3x4x6x2xf32> + // CHECK: return %[[v2]] : tensor<3x4x6x2xf32> +} + +func @einsum_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) -> tensor<4xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,j->i"}: (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +// CHECK-LABEL: einsum_no_match +// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,j->i"} : (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> +// CHECK: return %[[v0]] +} +func @einsum_illegal_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) -> tensor<4xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,?zw->kq->i"}: (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +// CHECK-LABEL: einsum_illegal_no_match +// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,?zw->kq->i"} : (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> +// CHECK: return %[[v0]] +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/gpu_fusion.mlir b/tensorflow/compiler/mlir/tensorflow/tests/gpu_fusion.mlir new file mode 100644 index 00000000000..6e507f06ef4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/gpu_fusion.mlir @@ -0,0 +1,47 @@ +// RUN: tf-opt %s -tf-gpu-op-fusion | FileCheck %s --dump-input=fail + +// Test the op-fusion pass specific to the GPU target. + +// CHECK-LABEL: func @FusedBatchNormRelu +func @FusedBatchNormRelu(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { +// CHECK-NEXT: %[[Y:[a-z0-9]*]], {{.*}}_FusedBatchNormEx +// CHECK-NEXT: return %[[Y]] + %y:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %relu = "tf.Relu"(%y#0) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + return %relu : tensor<8x8x8x8xf32> +} + +// CHECK-LABEL: func @FusedBatchNormAddRelu +func @FusedBatchNormAddRelu(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { +// CHECK-NEXT: %[[Y:[a-z0-9]*]], {{.*}}_FusedBatchNormEx +// CHECK-NEXT: return %[[Y]] + %y:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %add = "tf.AddV2"(%arg0, %y#0) : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + %relu = "tf.Relu"(%add) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + return %relu : tensor<8x8x8x8xf32> +} + +// CHECK-LABEL: func @FusedBatchNormAddReluTwoUses +func @FusedBatchNormAddReluTwoUses(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) { +// Since the tf.AddV2 op has two uses, we have a _FusedBatchNormEx without the +// Relu activation and we only fuse the add. 
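+// (Folding the Relu as well would make the pre-activation sum unavailable to
+// its second use, so only the AddV2 side input is fused.)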
+// CHECK-NEXT: %[[Y:[a-z0-9]*]], {{.*}}_FusedBatchNormEx +// CHECK-NEXT: %[[relu:[a-z0-9]*]] ={{.*}}Relu"(%[[Y]] +// CHECK-NEXT: return %[[relu]] + %y:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %add = "tf.AddV2"(%arg0, %y#0) : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + %relu = "tf.Relu"(%add) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + return %relu, %add : tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32> +} + +// CHECK-LABEL: func @TrainingFusedBatchNormRelu +func @TrainingFusedBatchNormRelu(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { + // We don't fuse in training right now +// CHECK-NEXT: %[[Y:[a-z0-9]*]], {{.*}}FusedBatchNorm +// CHECK-NEXT: %[[relu:[a-z0-9]*]] ={{.*}}Relu"(%[[Y]] +// CHECK-NEXT: return %[[relu]] + %y:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + %relu = "tf.Relu"(%y#0) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> + return %relu : tensor<8x8x8x8xf32> +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD index c15aad27209..1544d27009f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD @@ -8,6 +8,9 @@ glob_lit_tests( ":test_utilities", ], driver = "@llvm-project//mlir:run_lit.sh", + tags_override = { + "error-message-with-source-info.pbtxt": ["no_oss"], # TODO(b/150946057): to be fixed on oss. 
+ }, test_file_exts = ["pbtxt"], ) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt index 9ae5601fa57..bb5e02fedf2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-uint8-return.pbtxt @@ -107,5 +107,5 @@ versions { # CHECK: "tf.PartitionedCall"() # CHECK-SAME: Tout = ["tfdtype$DT_UINT8"] # CHECK-SAME: f = @[[FUNCTION:[A-Za-z0-9_]*]] -# CHECK: func @[[FUNCTION]]() -> tensor -# CHECK: return {{.*}} : tensor +# CHECK: func @[[FUNCTION]]() -> tensor +# CHECK: return {{.*}} : tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_gpu_cc_60.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_gpu_cc_60.mlir new file mode 100644 index 00000000000..3786a26d114 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_gpu_cc_60.mlir @@ -0,0 +1,25 @@ +// RUN: tf-opt %s -tf-layout-assignment -verify-diagnostics | FileCheck %s --dump-input=always + +module attributes { + tf.devices = {"/device:GPU:0" = {cc_major = 6 : i32, cc_minor = 0 : i32}} +} { + +// CHECK-LABEL: func @transposeConv2D_3x3_f16 +func @transposeConv2D_3x3_f16(%input: tensor<1x28x28x64xf16>, %filter: tensor<3x3x64x64xf16>) -> tensor<1x28x28x64xf16> { + // cuDNN prefers NCHW data format for spatial convolutions in f16 before + // compute capability 7.0 (NVIDIA Tensor Cores). + + // CHECK: "tf.Conv2D"(%[[INPUT_TRANSPOSE:[0-9]*]], %arg1) + // CHECK-SAME: data_format = "NCHW" + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NHWC", + padding = "VALID", + strides = [1, 1, 1, 1] + } : (tensor<1x28x28x64xf16>, tensor<3x3x64x64xf16>) + -> tensor<1x28x28x64xf16> + + return %0 : tensor<1x28x28x64xf16> +} + +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_gpu_cc_70.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_gpu_cc_70.mlir new file mode 100644 index 00000000000..0b2588c38cc --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_gpu_cc_70.mlir @@ -0,0 +1,66 @@ +// RUN: tf-opt %s -tf-layout-assignment -verify-diagnostics | FileCheck %s --dump-input=always + +module attributes { + tf.devices = {"/device:GPU:0" = {cc_major = 7 : i32, cc_minor = 0 : i32}} +} { + +// CHECK-LABEL: func @transposeConv2D_3x3_f32 +func @transposeConv2D_3x3_f32(%input: tensor<1x28x28x64xf32>, %filter: tensor<3x3x64x64xf32>) -> tensor<1x28x28x64xf32> { + // cuDNN prefers NCHW data format for spatial convolutions. + // CHECK: "tf.Conv2D"(%[[INPUT_TRANSPOSE:[0-9]*]], %arg1) + // CHECK-SAME: data_format = "NCHW" + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NHWC", + padding = "VALID", + strides = [1, 1, 1, 1] + } : (tensor<1x28x28x64xf32>, tensor<3x3x64x64xf32>) + -> tensor<1x28x28x64xf32> + + return %0 : tensor<1x28x28x64xf32> +} + +// CHECK-LABEL: func @transposeConv2D_1x1_f32 +func @transposeConv2D_1x1_f32(%input: tensor<1x64x28x28xf32>, %filter: tensor<1x1x64x64xf32>) -> tensor<1x64x28x28xf32> { + // 1x1 convolution can be computed as a GEMM in NHWC data format. 
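+  // (This mirrors Conv2DOp::GetOptimalLayout: a 1x1 filter with unit strides
+  // and dilations prefers NHWC even when the op requested NCHW.)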
+ // CHECK: "tf.Conv2D"(%[[INPUT_TRANSPOSE:[0-9]*]], %arg1) + // CHECK-SAME: data_format = "NHWC" + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NCHW", + padding = "VALID", + strides = [1, 1, 1, 1] + } : (tensor<1x64x28x28xf32>, tensor<1x1x64x64xf32>) + -> tensor<1x64x28x28xf32> + + // Striding in spatial dimensions does not allow to use GEMM. + // CHECK: "tf.Conv2D"(%arg0, %arg1) + // CHECK-SAME: data_format = "NCHW" + %1 = "tf.Conv2D"(%input, %filter) + { + data_format = "NCHW", + padding = "VALID", + strides = [1, 1, 2, 2] + } : (tensor<1x64x28x28xf32>, tensor<1x1x64x64xf32>) + -> tensor<1x64x14x14xf32> + + return %0 : tensor<1x64x28x28xf32> +} + +// CHECK-LABEL: func @transposeConv2D_3x3_f16 +func @transposeConv2D_3x3_f16(%input: tensor<1x64x28x28xf16>, %filter: tensor<3x3x64x64xf16>) -> tensor<1x64x28x28xf16> { + // To use Tensor Cores for f16 data type, input must be in NHWC data format. + // CHECK: "tf.Conv2D"(%[[INPUT_TRANSPOSE:[0-9]*]], %arg1) + // CHECK-SAME: data_format = "NHWC" + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NCHW", + padding = "VALID", + strides = [1, 1, 1, 1] + } : (tensor<1x64x28x28xf16>, tensor<3x3x64x64xf16>) + -> tensor<1x64x28x28xf16> + + return %0 : tensor<1x64x28x28xf16> +} + +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir index 0610cbe8680..b66289ae34b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir @@ -1,52 +1,12 @@ // RUN: tf-opt %s -tf-layout-assignment=force-data-format=NCHW -verify-diagnostics | FileCheck %s --dump-input=always -// CHECK-LABEL: func @transposeBiasAdd -func @transposeBiasAdd(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tensor<1x4x4x8xf32> { - - // Check that BiasAdd was converted to forced data format, and layout - // dependent arguments and results passed through transpose nodes. 
- - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} - // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) - // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%[[ARG_TRANSPOSE]], %arg1) {data_format = "NCHW"} {{.*}} tensor<1x8x4x4xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} - // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[BIAS_ADD]], %[[RES_PERM]]) - // CHECK: return %[[RES_TRANSPOSE]] - %0 = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NHWC"} : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> - - return %0 : tensor<1x4x4x8xf32> -} - -// CHECK-LABEL: func @transposeBiasAddWithDefaultAttr -func @transposeBiasAddWithDefaultAttr(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tensor<1x4x4x8xf32> { - - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} - // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) - // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%[[ARG_TRANSPOSE]], %arg1) {data_format = "NCHW"} {{.*}} tensor<1x8x4x4xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} - // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[BIAS_ADD]], %[[RES_PERM]]) - // CHECK: return %[[RES_TRANSPOSE]] - %0 = "tf.BiasAdd"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> - - return %0 : tensor<1x4x4x8xf32> -} - -// CHECK-LABEL: func @transposeBiasWithUnknownShape -func @transposeBiasWithUnknownShape(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tensor<*xf32> { - - // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%[[ARG_TRANSPOSE]], %arg1) {data_format = "NCHW"} {{.*}} tensor<*xf32> - %0 = "tf.BiasAdd"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<*xf32> - - return %0 : tensor<*xf32> -} +// IMPORTANT: In the following Conv2D tests tensor shapes do not match +// convolution parameters (stride, dilations, etc...). This test only verifies +// that changing convolution data layout will update all the attributes. // CHECK-LABEL: func @transposeConv2D func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> { - // IMPORTANT: Tensor shapes do not match convolution parameters (stride, - // dilations, etc...). This test only verifies that changing convolution data - // layout will update all the attributes. 
- // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) @@ -73,3 +33,35 @@ func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32 return %0 : tensor<1x32x32x8xf32> } + +// CHECK-LABEL: func @transposeConv2DWithDefaultAttr +func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<*xf32> +{ + + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + + // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) + // CHECK-SAME: data_format = "NCHW" + // CHECK-SAME: dilations = [1, 4, 2, 3] + // CHECK-SAME: explicit_paddings = [1, 2, 7, 8, 3, 4, 5, 6] + // CHECK-SAME: padding = "EXPLICIT" + // CHECK-SAME: strides = [5, 8, 6, 7] + // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<*xf32> + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + // (1) data_format attribute has default value NHWC + // (2) result shape is unknown (check that optimizer does not fail) + %0 = "tf.Conv2D"(%input, %filter) + { + dilations = [1, 2, 3, 4], + explicit_paddings = [1, 2, 3, 4, 5, 6, 7, 8], + padding = "EXPLICIT", + strides = [5, 6, 7, 8] + } : (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<*xf32> + + return %0 : tensor<*xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir index a2394cd93c1..ae3592b723f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_to_nchw.mlir @@ -1,24 +1,33 @@ // RUN: tf-opt %s -tf-layout-optimization=force-data-format=NCHW -verify-diagnostics | FileCheck %s --dump-input=always -// CHECK-LABEL: func @transposeBiasAdd -func @transposeBiasAdd(%arg0: tensor<1x8x4x4xf32>, %arg1: tensor<8xf32>) -> tensor<1x8x4x4xf32> { +// CHECK-LABEL: func @transposeConv2D +func @transposeConv2D(%arg0: tensor<1x3x32x32xf32>, %arg1: tensor<1x1x3x8xf32>) -> tensor<1x3x32x32xf32> { // Convert input: NCHW -> NHWC %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> - %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x3x32x32xf32>, tensor<4xi32>) -> tensor<1x32x32x3xf32> // Compute in NHWC - %2 = "tf.BiasAdd"(%1, %arg1) {data_format = "NHWC"} : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> + %2 = "tf.Conv2D"(%1, %arg1) + { + data_format = "NHWC", + padding = "SAME", + strides = [1, 1, 1, 1], + dilations = [1, 1, 1, 1] + } : (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x3xf32> // Convert result back: NHWC -> NCHW %3 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> - %4 = "tf.Transpose"(%2, %3) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + %4 = "tf.Transpose"(%2, %3) : (tensor<1x32x32x3xf32>, tensor<4xi32>) -> tensor<1x3x32x32xf32> - // Check that BiasAdd computed in NCHW format, and all redundant transpose + // Check that Conv2D computed in NCHW format, and all 
redundant transpose // operations removed from the function. - // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NCHW"} {{.*}} tensor<1x8x4x4xf32> - // CHECK: return %[[BIAS_ADD]] + // CHECK: %[[CONV:[0-9]*]] = "tf.Conv2D"(%arg0, %arg1) + // CHECK-SAME: data_format = "NCHW" + // CHECK-SAME: -> tensor<1x3x32x32xf32> - return %4 : tensor<1x8x4x4xf32> + // CHECK: return %[[CONV]] + + return %4 : tensor<1x3x32x32xf32> } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index 7b92d0776f8..c5f87c602a3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -392,12 +392,12 @@ func @DynamicStitch_scalar_matrix_indices(%arg0: tensor<2xf32>, %arg1: tensor<2x // Verify that custom types are lowered and have legal output. // CHECK-LABEL: func @DynamicStitch_uint8 -func @DynamicStitch_uint8(%arg0: tensor<2x2x!tf.uint8>) -> tensor<2x2x!tf.uint8> { +func @DynamicStitch_uint8(%arg0: tensor<2x2xui8>) -> tensor<2x2xui8> { // CHECK-NOT: tf.DynamicStitch %indices = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> - %0 = "tf.DynamicStitch"(%indices, %arg0) : (tensor<2xi32>, tensor<2x2x!tf.uint8>) -> tensor<2x2x!tf.uint8> - return %0 : tensor<2x2x!tf.uint8> + %0 = "tf.DynamicStitch"(%indices, %arg0) : (tensor<2xi32>, tensor<2x2xui8>) -> tensor<2x2xui8> + return %0 : tensor<2x2xui8> } // CHECK-LABEL: func @DynamicStitch_scalar_item diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index c9db7e0a1dc..706524e39a1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -254,4 +254,28 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %0 = "tf.Cast"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) return %0 : tensor<*xf32> } + + // CHECK-LABEL: func @while_variant + // CHECK-SAME: -> tensor>> + func @while_variant(%arg0: tensor>>) -> tensor { + // CHECK: tf.While + // CHECK-SAME: -> tensor>> + %0 = "tf.While"(%arg0) {cond = @variant_cond_func, body = @variant_body_func, is_stateless = true} : (tensor>>) -> tensor + // CHECK: tf.ZerosLike + // CHECK-SAME: -> tensor>> + %1 = "tf.ZerosLike"(%0) : (tensor) -> tensor + // CHECK: tf.Identity + // CHECK-SAME: -> tensor>> + %2 = "tf.Identity"(%1) : (tensor) -> tensor + return %2 : tensor + } + // CHECK-LABEL: func @variant_cond_func + func @variant_cond_func(%arg0: tensor>>) -> tensor { + %0 = "tf._SomeOp"() : () -> tensor + return %0 : tensor + } + // CHECK-LABEL: func @variant_body_func + func @variant_body_func(%arg0: tensor>>) -> tensor>> { + return %arg0 : tensor>> + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir index 23b77399d4f..e8c5bb59663 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir @@ -6,14 +6,14 @@ func @main() -> tensor { // CHECK-NEXT: "tf.Const"() {value = dense<10> : tensor} %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // CHECK-NEXT: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - // CHECK-NEXT: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> - // CHECK-NEXT: 
%[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> - // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[ZERO]]) // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK-NEXT: %[[CAST_ZERO:.*]] = "tf.Cast"(%[[ZERO_SCALAR]]) : (tensor) -> tensor // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[CAST_ZERO]], %[[CONST10]]) : (tensor, tensor<1xi32>) -> tensor<10xf32> + // CHECK-NEXT: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK-NEXT: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK-NEXT: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[ZERO]]) // CHECK-NEXT: "tf.AssignVariableOp"(%[[BUFFER]], %[[BROADCAST]]) %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor %id = "tf.Identity"(%stack) : (tensor) -> tensor @@ -52,13 +52,13 @@ func @main() -> tensor { func @main() -> tensor<2xi32> { // CHECK-NEXT: "tf.Const"() {value = dense<10> : tensor} : () -> tensor %size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: %[[ZERO_CONST:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: %[[STACK_SHAPE:.*]] = "tf.Const"() {value = dense<[10, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[ZERO_CONST]], %[[STACK_SHAPE]]) : (tensor, tensor<2xi32>) -> tensor<10x2xi32> // CHECK-NEXT: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK-NEXT: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> // CHECK-NEXT: %[[ZERO_SIZE:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[SIZE]], %[[ZERO_SIZE]]) : (tensor>>, tensor<1xi32>) -> () - // CHECK-NEXT: %[[ZERO_CONST:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - // CHECK-NEXT: %[[STACK_SHAPE:.*]] = "tf.Const"() {value = dense<[10, 2]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[ZERO_CONST]], %[[STACK_SHAPE]]) : (tensor, tensor<2xi32>) -> tensor<10x2xi32> // CHECK-NEXT: "tf.AssignVariableOp"(%[[BUFFER]], %[[BROADCAST]]) : (tensor>>, tensor<10x2xi32>) -> () %stack = "tf.StackV2"(%size) {elem_type = i32, stack_name = "s"} : (tensor) -> tensor // CHECK-NEXT: %[[PUSH_VAL:.*]] = "tf._SomeOp"() : () -> tensor<2xi32> @@ -151,10 +151,10 @@ func @if_then(%arg0: tensor) -> tensor { } // CHECK: func @if_else(%[[EARG0:.*]]: tensor>>, %[[EARG1:.*]]: tensor>>) func @if_else(%arg0: tensor) -> tensor { - // CHECK-NOT: "tf.StackPushV2" + // CHECK-NOT: "tf.StackPopV2" // CHECK: "tf.Slice" // CHECK: "tf.AssignVariableOp"(%[[EARG1:.*]], - // CHECK-NOT: "tf.StackPushV2" + // CHECK-NOT: "tf.StackPopV2" %pop = "tf.StackPopV2"(%arg0) : (tensor) -> tensor return %arg0 : tensor } @@ -204,7 +204,7 @@ func @callee(%arg0: tensor, %arg1: tensor) -> tensor) -> tensor<2xi32> { - // expected-error @+1 {{max size of stack is not a constant.}} + // expected-error @+1 {{unknown max element count}} %stack = "tf.StackV2"(%arg0) {elem_type = i32, stack_name = "s"} : (tensor) -> tensor %elem = "tf._SomeOp"() : () -> tensor<2xi32> %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor<2xi32>) -> tensor<2xi32> @@ -218,7 +218,7 @@ func @main(%arg0: tensor) -> tensor<2xi32> { func @main(%arg0: 
tensor) -> () { %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor - // expected-error @+1 {{cannot infer element shape of stack.}} + // expected-error @+1 {{cannot infer element shape of stack}} %stack = "tf.StackV2"(%max_size) {elem_type = i32, stack_name = "s"} : (tensor) -> tensor %elem = "tf._SomeOp"() : () -> tensor<*xi32> %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor<*xi32>) -> tensor<*xi32> @@ -236,7 +236,7 @@ func @main(%arg0: tensor) -> () { %stack2 = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s2"} : (tensor) -> tensor %if_op = "tf.If"(%arg0, %stack, %stack2) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, tensor, tensor) -> tensor - // expected-error @+1 {{unknown stack.}} + // expected-error @+1 {{unknown stack}} %pop = "tf.StackPopV2"(%if_op) : (tensor) -> tensor "tf.StackCloseV2"(%stack) : (tensor) -> () // CHECK: return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir new file mode 100644 index 00000000000..9e43cea1003 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -0,0 +1,277 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tensor-list-ops-decomposition | FileCheck %s -dump-input-on-failure + +// Test push and pop on a tensor list which is initially empty. + +// CHECK-LABEL: func @main +func @main() -> (tensor, tensor) { + // CHECK-NEXT: "tf.Const"() {value = dense<[]> : tensor<0xi32>} + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: "tf.Const"() {value = dense<10> : tensor} + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: %[[CAST_ZERO:.*]] = "tf.Cast"(%[[ZERO_SCALAR]]) : (tensor) -> tensor + // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[CAST_ZERO]], %[[CONST10]]) : (tensor, tensor<1xi32>) -> tensor<10xf32> + // CHECK-NEXT: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + %id = "tf.Identity"(%tl) : (tensor>>) -> tensor>> + // CHECK-NEXT: %[[PUSHVAL:.*]] = "tf._SomeOp"() + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[PUSHVAL]], %[[UPDATE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xf32> + // CHECK-NEXT: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BROADCAST]], %[[UPDATE_SLICE]], %[[ZERO]]) : (tensor<10xf32>, tensor<1xf32>, tensor<1xi32>) -> tensor<10xf32> + // CHECK-NEXT: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[NEW_SIZE:.*]] = "tf.AddV2"(%[[ZERO]], %[[CONST1]]) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + %push = "tf.TensorListPushBack"(%id, %elem) : (tensor>>, tensor) -> tensor>> + // CHECK-NEXT: %[[COPY:.*]] = "tf.Identity"(%[[UPDATE]]) + // CHECK-NEXT: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SUB:.*]] = "tf.Sub"(%[[NEW_SIZE]], %[[CONST1_1]]) + // CHECK-NEXT: 
%[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor + %pop:2 = "tf.TensorListPopBack"(%push, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK-NEXT: %[[SCALAR_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} + // CHECK-NEXT: %[[LENGTH:.*]] = "tf.Reshape"(%[[NEW_SIZE]], %[[SCALAR_SHAPE]]) + %length = "tf.TensorListLength"(%push) : (tensor>>) -> tensor + // CHECK-NEXT: return %[[ELEM]], %[[LENGTH]] : tensor, tensor + return %pop#1, %length: tensor, tensor +} + +// ----- + +// Test get and set, and other operations on a tensor list which has reserved +// initial size. + +// CHECK-LABEL: func @main +// CHECK-SAME: (%[[ARG0:.*]]: tensor) -> (tensor, tensor<10xf32>, tensor) +func @main(%arg0: tensor) -> (tensor, tensor<10xf32>, tensor) { + // CHECK-NEXT: "tf.Const"() {value = dense<[]> : tensor<0xi32>} + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[NUM:.*]] = "tf.Const"() {value = dense<10> : tensor} + %num = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NEXT: %[[ZERO_SCALAR:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: %[[CAST_ZERO:.*]] = "tf.Cast"(%[[ZERO_SCALAR]]) : (tensor) -> tensor + // CHECK-NEXT: %[[CONST10:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[BROADCAST:.*]] = "tf.BroadcastTo"(%[[CAST_ZERO]], %[[CONST10]]) : (tensor, tensor<1xi32>) -> tensor<10xf32> + // CHECK-NEXT: %[[SIZE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[SIZE:.*]] = "tf.Reshape"(%[[NUM]], %[[SIZE_SHAPE]]) + %tl = "tf.TensorListReserve"(%elem_shape, %num) : (tensor<0xi32>, tensor) -> tensor>> + // CHECK-NEXT: %[[SETVAL:.*]] = "tf._SomeOp"() + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NEXT: %[[SIZE_SHAPE1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[SET_INDEX:.*]] = "tf.Reshape"(%[[ARG0]], %[[SIZE_SHAPE1]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: %[[UPDATE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[UPDATE_SLICE:.*]] = "tf.Reshape"(%[[SETVAL]], %[[UPDATE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xf32> + // CHECK-NEXT: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BROADCAST]], %[[UPDATE_SLICE]], %[[SET_INDEX]]) : (tensor<10xf32>, tensor<1xf32>, tensor<1xi32>) -> tensor<10xf32> + %set = "tf.TensorListSetItem"(%tl, %arg0, %elem) : (tensor>>, tensor, tensor) -> tensor>> + // CHECK-NEXT: %[[SIZE_SHAPE2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[GET_INDEX:.*]] = "tf.Reshape"(%[[ARG0]], %[[SIZE_SHAPE2]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[UPDATE]], %[[GET_INDEX]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: 
%[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor + %get = "tf.TensorListGetItem"(%set, %arg0, %elem_shape) : (tensor>>, tensor, tensor<0xi32>) -> tensor + // CHECK-NEXT: %[[ADDN:.*]] = "tf.AddN"(%[[UPDATE]], %[[BROADCAST]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + %addn = "tf.AddN"(%set, %tl) : (tensor>>, tensor>>) -> tensor>> + // CHECK-NEXT: %[[ZEROS_LIKE:.*]] = "tf.ZerosLike"(%[[ADDN]]) : (tensor<10xf32>) -> tensor<10xf32> + %zeros-like = "tf.ZerosLike"(%addn) : (tensor>>) -> tensor>> + // CHECK-NEXT: %[[ADDN2:.*]] = "tf.AddN"(%[[ADDN]], %[[ZEROS_LIKE]]) : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + %addn2 = "tf.AddN"(%addn, %zeros-like) : (tensor>>, tensor>>) -> tensor>> + %stack = "tf.TensorListStack"(%addn2, %elem_shape) : (tensor>>, tensor<0xi32>) -> tensor<10xf32> + // CHECK-NEXT: %[[LEN:.*]] = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %length = "tf.TensorListLength"(%addn2) : (tensor>>) -> tensor + // CHECK-NEXT: return %[[ELEM]], %[[ADDN2]], %[[LEN]] : tensor, tensor<10xf32>, tensor + return %get, %stack, %length : tensor, tensor<10xf32>, tensor +} + +// ----- + +// Test get on a tensor list created from a tensor. + +// CHECK-LABEL: func @main +// CHECK-SAME: (%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor<10xf32>) -> tensor +func @main(%arg0: tensor, %arg1: tensor<10xf32>) -> tensor { + // CHECK-NEXT: "tf.Const"() {value = dense<[]> : tensor<0xi32>} + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[BUFFER:.*]] = "tf.Identity"(%arg1) : (tensor<10xf32>) -> tensor<10xf32> + // CHECK-NEXT: %[[SIZE:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + %tl = "tf.TensorListFromTensor"(%arg1, %elem_shape) : (tensor<10xf32>, tensor<0xi32>) -> tensor>> + // CHECK-NEXT: %[[SIZE_SHAPE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK-NEXT: %[[GET_INDEX:.*]] = "tf.Reshape"(%[[ARG0]], %[[SIZE_SHAPE]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK-NEXT: %[[SLICE:.*]] = "tf.Slice"(%[[BUFFER]], %[[GET_INDEX]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> + // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor + %get = "tf.TensorListGetItem"(%tl, %arg0, %elem_shape) : (tensor>>, tensor, tensor<0xi32>) -> tensor + // CHECK-NEXT: return %[[ELEM]] : tensor + return %get: tensor +} + +// ----- + +// Tests while loop. 
+ +// CHECK-LABEL: func @main +func @main() -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + %1:2 = "tf.While"(%tl, %max_size) { + body = @while_body, cond = @while_cond, device = "", is_stateless = false} + : (tensor>>, tensor) -> (tensor>>, tensor) + // CHECK: "tf.Slice" + %pop:2 = "tf.TensorListPopBack"(%1#0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK-NOT: tf.EmptyTensorList + // CHECK: return + return +} +// CHECK: func @while_body(%[[BARG0:.*]]: tensor<10xf32>, %[[BARG1:.*]]: tensor, %[[BARG2:.*]]: tensor<1xi32>) +func @while_body(%arg0: tensor>>, %arg1: tensor) -> (tensor>>, tensor) { + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BARG1]], %[[CONST1]]) + %sub = "tf.Sub"(%arg1, %const1) : (tensor, tensor) -> tensor + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NOT: "tf.TensorListPushBack" + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[BARG2]], %[[CONST1]]) + // CHECK-NOT: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> + // CHECK: return %[[UPDATE]], %[[SUB]], %[[ADD]] + return %push, %sub : tensor>>, tensor +} +// CHECK: func @while_cond(%[[CARG0:.*]]: tensor<10xf32>, %[[CARG1:.*]]: tensor, %[[CARG2:.*]]: tensor<1xi32>) +func @while_cond(%arg0: tensor>>, %arg1: tensor) -> tensor { + // CHECK-NEXT: return %[[CARG1]] + return %arg1 : tensor +} + +// ----- + +// Tests IfOp. 
+ +// CHECK-LABEL: func @main +func @main(%arg0: tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + %if_op = "tf.If"(%arg0, %tl) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} + : (tensor, tensor>>) -> tensor>> + // CHECK: "tf.Slice" + %pop:2 = "tf.TensorListPopBack"(%if_op, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK-NOT: tf.TensorListPopBack + // CHECK: return + return +} +// CHECK: func @if_then(%[[TARG0:.*]]: tensor<10xf32>, %[[TARG1:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +func @if_then(%arg0: tensor>>) -> tensor>> { + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NOT: "tf.TensorListPushBack" + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[TARG1]], %[[CONST1]]) + // CHECK-NOT: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> + // CHECK: return %[[UPDATE]], %[[ADD]] + return %push : tensor>> +} +// CHECK: func @if_else(%[[EARG0:.*]]: tensor<10xf32>, %[[EARG1:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +func @if_else(%arg0: tensor>>) -> tensor>> { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK-NOT: "tf.TensorListPopBack" + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[EARG0]]) + // CHECK: %[[CONST1_1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[EARG1]], %[[CONST1_1]]) + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) : (tensor<10xf32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1xf32> + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor + // CHECK-NOT: "tf.TensorListPopBack" + %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: return %[[COPY]], %[[SUB]] + return %pop#0 : tensor>> +} + +// ----- + +// Tests PartitionedCall/StatefulPartitionedCall. 
+ +// CHECK-LABEL: func @main +func @main(%arg0: tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + // CHECK: %[[INIT:.*]] = "tf.BroadcastTo" + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + // CHECK: "tf.StatefulPartitionedCall"(%[[INIT]], + // CHECK-SAME: f = @callee_tensorlist_decomposed + %call = "tf.StatefulPartitionedCall"(%tl, %arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor>>, tensor) -> tensor>> + // CHECK: %[[CALL2:.*]]:2 = "tf.PartitionedCall"(%[[INIT]], + // CHECK-SAME: f = @callee_tensorlist_decomposed + %call2 = "tf.PartitionedCall"(%tl, %arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor>>, tensor) -> tensor>> + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[CALL2]]#0) + // CHECK: "tf.Slice"(%[[COPY]], + %pop:2 = "tf.TensorListPopBack"(%call2, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK-NOT: tf.TensorListPopBack + // CHECK: return + return +} + +// CHECK: func @callee(%[[AARG0:.*]]: tensor>>, %[[AARG1:.*]]: tensor) -> tensor>> +func @callee(%arg0: tensor>>, %arg1: tensor) -> tensor>> { + %elem = "tf._SomeOp"(%arg1) : (tensor) -> tensor + // CHECK: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> + return %push : tensor>> +} + +// CHECK: func @callee_tensorlist_decomposed(%[[ARG0:.*]]: tensor<10xf32>, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) +// CHECK-NOT: "tf.TensorListPushBack" +// CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" +// CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ARG2]], %[[CONST1]]) +// CHECK-NOT: "tf.TensorListPushBack" +// CHECK: return %[[UPDATE]], %[[ADD]] + +// ----- + +// Tests that the pass reports error on unknown maximum size. + +func @main(%arg0: tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + // expected-error @+1 {{unknown max element count}} + %tl = "tf.EmptyTensorList"(%elem_shape, %arg0) : (tensor<0xi32>, tensor) -> tensor>> + return +} + +// ----- + +// Tests that the pass reports error on unknown element shape. + +func @main(%arg0: tensor<*xi32>) -> () { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // expected-error @+1 {{unknown tensor list element shape}} + %tl = "tf.EmptyTensorList"(%arg0, %max_size) : (tensor<*xi32>, tensor) -> tensor>> + return +} + +// ----- + +// Tests that the pass reports error on pushing elements to a fixed-size tensor +// list. 
+ +func @main(%arg0: tensor<*xi32>) -> () { + %elem_shape = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %num = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %tl = "tf.TensorListReserve"(%elem_shape, %num) : (tensor<0xi32>, tensor) -> tensor>> + %elem = "tf._SomeOp"() : () -> tensor + // expected-error @+1 {{cannot push on a fixed-size tensor list}} + %push = "tf.TensorListPushBack"(%tl, %elem) : (tensor>>, tensor) -> tensor>> + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 319660ae4bb..b7d1f3a7104 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -66,17 +66,17 @@ func @testIdentity(%arg0: tensor<4x2x!tf.stringref>) -> tensor<4x2x!tf.string> { // ----- // CHECK-LABEL: func @testBitcast -func @testBitcast(%arg0: tensor<3x4x!tf.uint16>) -> tensor<3x4x!tf.quint16> { - %0 = "tf.Bitcast"(%arg0) : (tensor<3x4x!tf.uint16>) -> tensor<3x4x!tf.quint16> +func @testBitcast(%arg0: tensor<3x4xui16>) -> tensor<3x4x!tf.quint16> { + %0 = "tf.Bitcast"(%arg0) : (tensor<3x4xui16>) -> tensor<3x4x!tf.quint16> return %0 : tensor<3x4x!tf.quint16> } // ----- // CHECK-LABEL: func @testReverseV2 -func @testReverseV2(%arg0: tensor<2x4x3x!tf.uint8>, %arg1: tensor<1xi32>) -> tensor<2x4x3x!tf.uint8> { - %0 = "tf.ReverseV2"(%arg0, %arg1) : (tensor<2x4x3x!tf.uint8>, tensor<1xi32>) -> tensor<2x4x3x!tf.uint8> - return %0 : tensor<2x4x3x!tf.uint8> +func @testReverseV2(%arg0: tensor<2x4x3xui8>, %arg1: tensor<1xi32>) -> tensor<2x4x3xui8> { + %0 = "tf.ReverseV2"(%arg0, %arg1) : (tensor<2x4x3xui8>, tensor<1xi32>) -> tensor<2x4x3xui8> + return %0 : tensor<2x4x3xui8> } // ----- @@ -210,9 +210,9 @@ func @testLeakyWrongAlphaType(tensor<16xf32>) -> tensor<16xf32> { // ----- // CHECK-LABEL: func @testMul -func @testMul(%arg0: tensor<2x!tf.uint16>) -> (tensor<2x!tf.uint16>) { - %0 = "tf.Mul"(%arg0, %arg0) {T = "tfdtype$DT_UINT16", device = "/device:CPU:0", name = "Mul"} : (tensor<2x!tf.uint16>, tensor<2x!tf.uint16>) -> tensor<2x!tf.uint16> - return %0 : tensor<2x!tf.uint16> +func @testMul(%arg0: tensor<2xui16>) -> (tensor<2xui16>) { + %0 = "tf.Mul"(%arg0, %arg0) {T = "tfdtype$DT_UINT16", device = "/device:CPU:0", name = "Mul"} : (tensor<2xui16>, tensor<2xui16>) -> tensor<2xui16> + return %0 : tensor<2xui16> } // ----- @@ -236,7 +236,7 @@ func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<1000 func @testReshape(tensor<*xf32>, tensor<*xf32>) -> (tensor<100x100xf32>) { ^bb0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>): %shape1 = constant dense<100.> : tensor<2xf32> - // expected-error @+1 {{must be tensor of 32/64-bit integer values}} + // expected-error @+1 {{must be tensor of 32/64-bit signless integer values}} %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<*xf32>, tensor<2xf32>) -> (tensor<100x100xf32>) return %r1 : tensor<100x100xf32> } @@ -1290,7 +1290,7 @@ func @testValidShape(tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<4xi32>, t // ----- func @testShapeWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf32> { - // expected-error @+1 {{result #0 must be tensor of 32/64-bit integer values}} + // expected-error @+1 {{result #0 must be tensor of 32/64-bit signless integer values}} %0 = "tf.Shape"(%arg0) : (tensor<1x32x32x16xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -1334,7 +1334,7 @@ func @testValidShapeN(%arg0 : tensor<1x32x32x16xf32>, %arg1 : tensor<*xf32>) -> // ----- 
func @testShapeNWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf32> { - // expected-error @+1 {{result #1 must be tensor of 32/64-bit integer values}} + // expected-error @+1 {{result #1 must be tensor of 32/64-bit signless integer values}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, tensor<4xf32>) return %0#1 : tensor<4xf32> } @@ -1395,7 +1395,7 @@ func @testVariableShapeMultipleSubtypes(%arg0: tensor<*x!tf.resource>>) -> tensor { - // expected-error @+1 {{result #0 must be tensor of 32/64-bit integer values}} + // expected-error @+1 {{result #0 must be tensor of 32/64-bit signless integer values}} %0 = "tf.VariableShape"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -1457,7 +1457,7 @@ func @testTranspose(tensor<2x3xf32>) -> tensor<3x2xf32> { // Test invalid tf.Less func @testLess(tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> { ^bb0(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>): - // expected-error @+1 {{op result #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{op result #0 must be tensor of 1-bit signless integer values}} %0 = "tf.Less"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> return %0 : tensor<4xi32> } @@ -1474,7 +1474,7 @@ func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor // tf.ConcatV2 with wrong 'axis' element type func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor { - // expected-error @+1 {{operand #2 must be tensor of 32/64-bit integer values}} + // expected-error @+1 {{operand #2 must be tensor of 32/64-bit signless integer values}} %0 = "tf.ConcatV2"(%arg, %arg, %axis) : (tensor<8x16xf32>, tensor<8x16xf32>, tensor) -> tensor return %0 : tensor } @@ -1507,7 +1507,7 @@ func @testAll64(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { // ----- func @testAllFloat(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { - // expected-error @+1 {{'tf.All' op operand #1 must be tensor of 32/64-bit integer values}} + // expected-error @+1 {{'tf.All' op operand #1 must be tensor of 32/64-bit signless integer values}} %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor return %0 : tensor } @@ -1515,7 +1515,7 @@ func @testAllFloat(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { // ----- func @testAllI32(%arg0: tensor<2x2xi32>, %arg1: tensor) -> tensor { - // expected-error @+1 {{'tf.All' op operand #0 must be tensor of 1-bit integer values}} + // expected-error @+1 {{'tf.All' op operand #0 must be tensor of 1-bit signless integer values}} %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi32>, tensor) -> tensor return %0 : tensor } @@ -2449,3 +2449,17 @@ func @testParseExampleV2RaggedMismatchedOutputLengths(%serialized: tensor<32x!tf %result:3 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %empty_str_vector, %ragged_keys) {dense_shapes = [], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 0, 2, 1]> : vector<6xi32>} : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>) -> (tensor, tensor, tensor) return %result#0 : tensor } + +// ----- + +func @testBatchMatMulV2(%lhs: tensor, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{requires lhs operand to have rank at least two}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor, tensor<10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func @testBatchMatMulV2(%lhs: tensor<10x10xf32>, %rhs: tensor) { + // expected-error @+1 
{{requires rhs operand to have rank at least two}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x10xf32>, tensor) -> tensor<10x10xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops_invalid.mlir index b1e2bc36900..07863e3e806 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops_invalid.mlir @@ -94,7 +94,7 @@ func @verifier_replicate_terminator() { // Check that a replicate with 'n' attribute that is less than 2 is invalid. func @verifier_replicate_n() { "tf_device.replicate" () ({ -// expected-error@-1 {{'tf_device.replicate' op attribute 'n' failed to satisfy constraint: 32-bit integer attribute whose minimum value is 2}} +// expected-error@-1 {{'tf_device.replicate' op attribute 'n' failed to satisfy constraint: 32-bit signless integer attribute whose minimum value is 2}} ^entry: tf_device.return }) {n = 1 : i32} : () -> () diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir index d0aa1414723..b9ec020ff59 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops_invalid.mlir @@ -348,7 +348,7 @@ func @invalid_switch(%arg0: tensor<*xf32>) { func @invalid_switch(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %result = tf_executor.graph { %true, %false, %ctlSwitch = "tf_executor.Switch"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>, !tf_executor.control) -// expected-error@-1 {{'tf_executor.Switch' op operand #1 must be tensor of 1-bit integer values}} +// expected-error@-1 {{'tf_executor.Switch' op operand #1 must be tensor of 1-bit signless integer values}} tf_executor.fetch %true : tensor<*xf32> } return %result : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir new file mode 100644 index 00000000000..ce44a562aca --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -0,0 +1,88 @@ +// RUN: tf-opt -verify-diagnostics -tf-saved-model-freeze-global-tensors -split-input-file %s | FileCheck %s --dump-input=fail + +module attributes {tf_saved_model.semantics} { + + // Test case: Basic freezing. + + // CHECK-NOT: tf_saved_model.global_tensor + "tf_saved_model.global_tensor"() {sym_name = "v", type = tensor, value = dense<1.0> : tensor } : () -> () + + // CHECK: func @f() + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + return + } +} + +// ----- + + +module attributes {tf_saved_model.semantics} { + + // Test case: Sanity check handling of non-bound inputs. + // The pass shouldn't do anything in this case. 
+ + // CHECK: func @f(%arg0: tensor>> {tf_saved_model.index_path = [0]}) + func @f(%arg0: tensor>> {tf_saved_model.index_path = [0]}) + attributes {tf_saved_model.exported_names = ["f"]} { + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + // CHECK: "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + return + } +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // Test case: Fail if mutable global tensors are found. + + // expected-error @+1 {{is not immutable}} + "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<1.0> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + return + } + +} + +// ----- + +module attributes {tf_saved_model.semantics} { + + // Test case: Fail if bound input user is not ReadVariableOp + + "tf_saved_model.global_tensor"() {sym_name = "v", type = tensor, value = dense<1.0> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + // expected-error @+1 {{could not rewrite use of immutable bound input}} + "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @f_callee} : (tensor>>) -> () + return + } + + func @f_callee(%arg0: tensor>>) { + return + } +} + +// ----- + +// expected-error @+1 {{could not freeze all global tensors in the module}} +module attributes {tf_saved_model.semantics} { + + // Test case: Fail if some global tensor ops remain + + "tf_saved_model.global_tensor"() {sym_name = "v", type = tensor, value = dense<1.0> : tensor } : () -> () + "tf_saved_model.global_tensor"() {sym_name = "v2", type = tensor, value = dense<1.0> : tensor } : () -> () + + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + attributes {tf_saved_model.exported_names = ["f"]} { + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor + return + } +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir index 9e54ff43933..d5cd9004132 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir @@ -12,37 +12,30 @@ func @merge_same_device_variables( %arg1: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg2: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg3: tensor) { - tf_executor.graph { - // CHECK: tf_executor.island - %island = tf_executor.island { - // CHECK-NEXT: %[[ID_0:.*]] = "tf.IdentityN"(%[[ARG_0]]) - %id0 = "tf.IdentityN"(%arg0) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} - : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> - // CHECK-NEXT: %[[READ_2:.*]] = "tf.ReadVariableOp"(%[[ARG_2]]) - %read0 = "tf.ReadVariableOp"(%id0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - %read1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<64xf32> - %read2 = "tf.ReadVariableOp"(%arg2) : (tensor<*x!tf.resource>>) -> tensor<16xf32> - // CHECK-NEXT: %[[EXE:.*]] = "tf_device.launch" - // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ID_0]], %[[ARG_1]], %[[READ_2]], %[[ARG_3]]) - // CHECK-SAME: device_var_reads_indices = [0, 1], - // CHECK-SAME: device_var_updates_indices = [0, -1] - %execute:2 = "tf_device.launch"() ( { - %0:2 = 
"tf.TPUExecute"(%read0, %read1, %read2, %arg3) { - Targs = [tensor<32xf32>, tensor<64xf32>, tensor<16xf32>], - Tresults = [tensor<32xf32>, tensor<16xf32>]} - : (tensor<32xf32>, tensor<64xf32>, tensor<16xf32>, tensor) -> (tensor<32xf32>, tensor<16xf32>) - tf_device.return %0#0, %0#1 : tensor<32xf32>, tensor<16xf32> - }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<16xf32>) - // CHECK-NEXT: tf_device.return - // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} - "tf.AssignVariableOp"(%id0, %execute#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_2]], %[[EXE]]) - "tf.AssignVariableOp"(%arg2, %execute#1) : (tensor<*x!tf.resource>>, tensor<16xf32>) -> () - // CHECK-NEXT: tf_executor.yield - tf_executor.yield - } - tf_executor.fetch %island : !tf_executor.control - } + // CHECK-NEXT: %[[ID_0:.*]] = "tf.IdentityN"(%[[ARG_0]]) + %id0 = "tf.IdentityN"(%arg0) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} + : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + // CHECK-NEXT: %[[READ_2:.*]] = "tf.ReadVariableOp"(%[[ARG_2]]) + %read0 = "tf.ReadVariableOp"(%id0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %read1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<64xf32> + %read2 = "tf.ReadVariableOp"(%arg2) : (tensor<*x!tf.resource>>) -> tensor<16xf32> + // CHECK-NEXT: %[[EXE:.*]] = "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ID_0]], %[[ARG_1]], %[[READ_2]], %[[ARG_3]]) + // CHECK-SAME: device_var_reads_indices = [0, 1], + // CHECK-SAME: device_var_updates_indices = [0, -1] + %execute:2 = "tf_device.launch"() ( { + %0:2 = "tf.TPUExecute"(%read0, %read1, %read2, %arg3) { + Targs = [tensor<32xf32>, tensor<64xf32>, tensor<16xf32>], + Tresults = [tensor<32xf32>, tensor<16xf32>]} + : (tensor<32xf32>, tensor<64xf32>, tensor<16xf32>, tensor) -> (tensor<32xf32>, tensor<16xf32>) + tf_device.return %0#0, %0#1 : tensor<32xf32>, tensor<16xf32> + }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<16xf32>) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} + "tf.AssignVariableOp"(%id0, %execute#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_2]], %[[EXE]]) + "tf.AssignVariableOp"(%arg2, %execute#1) : (tensor<*x!tf.resource>>, tensor<16xf32>) -> () + // CHECK-NEXT: return return } @@ -59,35 +52,28 @@ func @merge_replicated_variables( %arg1: tensor, %arg2: tensor<*x!tf.resource>>, %arg3: tensor<*x!tf.resource>>) { - tf_executor.graph { - // CHECK: tf_executor.island - %island = tf_executor.island { - // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) - %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - // CHECK-NEXT: tf_device.replicate([%[[ARG_2]], %[[ARG_3]]] as %[[R_ARG:.*]]: tensor<*x!tf.resource>>) - tf_device.replicate([%arg2, %arg3] as %r: tensor<*x!tf.resource>>) {n = 2 : i32} { - // CHECK-NEXT: "tf_device.launch" - // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[READ_0]], %[[R_ARG]], %[[ARG_1]]) - // CHECK-SAME: device_var_reads_indices = [1], - // CHECK-SAME: device_var_updates_indices = [0] - %read1 = "tf.ReadVariableOp"(%r) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - %execute = "tf_device.launch"() ( { - %0 = "tf.TPUExecute"(%read0, %read1, %arg1) - : (tensor<32xf32>, tensor<32xf32>, tensor) -> tensor<32xf32> - 
tf_device.return %0 : tensor<32xf32> - }) {device = ""} : () -> tensor<32xf32> - // CHECK-NEXT: tf_device.return - // CHECK-NEXT: }) {device = ""} - "tf.AssignVariableOp"(%r, %execute) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: tf_device.return - tf_device.return - // CHECK-NEXT: } - } - // CHECK-NEXT: tf_executor.yield - tf_executor.yield - } - tf_executor.fetch %island : !tf_executor.control + // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) + %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // CHECK-NEXT: tf_device.replicate([%[[ARG_2]], %[[ARG_3]]] as %[[R_ARG:.*]]: tensor<*x!tf.resource>>) + tf_device.replicate([%arg2, %arg3] as %r: tensor<*x!tf.resource>>) {n = 2 : i32} { + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[READ_0]], %[[R_ARG]], %[[ARG_1]]) + // CHECK-SAME: device_var_reads_indices = [1], + // CHECK-SAME: device_var_updates_indices = [0] + %read1 = "tf.ReadVariableOp"(%r) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %execute = "tf_device.launch"() ( { + %0 = "tf.TPUExecute"(%read0, %read1, %arg1) + : (tensor<32xf32>, tensor<32xf32>, tensor) -> tensor<32xf32> + tf_device.return %0 : tensor<32xf32> + }) {device = ""} : () -> tensor<32xf32> + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: }) {device = ""} + "tf.AssignVariableOp"(%r, %execute) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: tf_device.return + tf_device.return + // CHECK-NEXT: } } + // CHECK-NEXT: return return } @@ -112,46 +98,39 @@ func @interferencing_accesses( %arg4: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg5: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg6: tensor<2xf32>) -> (tensor<8xf32>) { - %graph = tf_executor.graph { - // CHECK: tf_executor.island - %island:2 = tf_executor.island { - // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) - %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - // CHECK-NEXT: %[[READ_5:.*]] = "tf.ReadVariableOp"(%[[ARG_5]]) - %read5 = "tf.ReadVariableOp"(%arg5) : (tensor<*x!tf.resource>>) -> tensor<2xf32> - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[ARG_2]]) - "tf.AssignVariableOp"(%arg0, %arg2) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_5]], %[[ARG_6]]) - "tf.AssignVariableOp"(%arg5, %arg6) : (tensor<*x!tf.resource>>, tensor<2xf32>) -> () - %read1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<64xf32> - %read2 = "tf.ReadVariableOp"(%arg4) : (tensor<*x!tf.resource>>) -> tensor<8xf32> - // CHECK-NEXT: %[[EXE:.*]]:2 = "tf_device.launch" - // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[READ_0]], %[[ARG_1]], %[[ARG_4]], %[[READ_5]], %[[ARG_3]]) - // CHECK-SAME: device_var_reads_indices = [1, 2], - // CHECK-SAME: device_var_updates_indices = [1, -1] - %execute:3 = "tf_device.launch"() ( { - %0:3 = "tf.TPUExecute"(%read0, %read1, %read2, %read5, %arg3) { - Targs = [tensor<32xf32>, tensor<64xf32>, tensor<8xf32>, tensor<2xf32>], - Tresults = [tensor<32xf32>, tensor<64xf32>, tensor<8xf32>]} - : (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>, tensor<2xf32>, tensor) - -> (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>) - tf_device.return %0#0, %0#1, %0#2 : tensor<32xf32>, tensor<64xf32>, tensor<8xf32> - }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>) - // 
CHECK-NEXT: tf_device.return - // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} - "tf.AssignVariableOp"(%arg1, %execute#1) : (tensor<*x!tf.resource>>, tensor<64xf32>) -> () - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#0) - "tf.AssignVariableOp"(%arg0, %execute#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: %[[READ_3:.*]] = "tf.ReadVariableOp"(%[[ARG_4]]) - %read3 = "tf.ReadVariableOp"(%arg4) : (tensor<*x!tf.resource>>) -> tensor<8xf32> - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_4]], %[[EXE]]#1) - "tf.AssignVariableOp"(%arg4, %execute#2) : (tensor<*x!tf.resource>>, tensor<8xf32>) -> () - // CHECK-NEXT: tf_executor.yield %[[READ_3]] - tf_executor.yield %read3 : tensor<8xf32> - } - tf_executor.fetch %island#0 : tensor<8xf32> - } - return %graph : tensor<8xf32> + // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) + %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // CHECK-NEXT: %[[READ_5:.*]] = "tf.ReadVariableOp"(%[[ARG_5]]) + %read5 = "tf.ReadVariableOp"(%arg5) : (tensor<*x!tf.resource>>) -> tensor<2xf32> + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[ARG_2]]) + "tf.AssignVariableOp"(%arg0, %arg2) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_5]], %[[ARG_6]]) + "tf.AssignVariableOp"(%arg5, %arg6) : (tensor<*x!tf.resource>>, tensor<2xf32>) -> () + %read1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<64xf32> + %read2 = "tf.ReadVariableOp"(%arg4) : (tensor<*x!tf.resource>>) -> tensor<8xf32> + // CHECK-NEXT: %[[EXE:.*]]:2 = "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[READ_0]], %[[ARG_1]], %[[ARG_4]], %[[READ_5]], %[[ARG_3]]) + // CHECK-SAME: device_var_reads_indices = [1, 2], + // CHECK-SAME: device_var_updates_indices = [1, -1] + %execute:3 = "tf_device.launch"() ( { + %0:3 = "tf.TPUExecute"(%read0, %read1, %read2, %read5, %arg3) { + Targs = [tensor<32xf32>, tensor<64xf32>, tensor<8xf32>, tensor<2xf32>], + Tresults = [tensor<32xf32>, tensor<64xf32>, tensor<8xf32>]} + : (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>, tensor<2xf32>, tensor) + -> (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>) + tf_device.return %0#0, %0#1, %0#2 : tensor<32xf32>, tensor<64xf32>, tensor<8xf32> + }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} + "tf.AssignVariableOp"(%arg1, %execute#1) : (tensor<*x!tf.resource>>, tensor<64xf32>) -> () + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#0) + "tf.AssignVariableOp"(%arg0, %execute#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: %[[READ_3:.*]] = "tf.ReadVariableOp"(%[[ARG_4]]) + %read3 = "tf.ReadVariableOp"(%arg4) : (tensor<*x!tf.resource>>) -> tensor<8xf32> + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_4]], %[[EXE]]#1) + "tf.AssignVariableOp"(%arg4, %execute#2) : (tensor<*x!tf.resource>>, tensor<8xf32>) -> () + // CHECK-NEXT: return %[[READ_3]] + return %read3 : tensor<8xf32> } // ----- @@ -165,30 +144,23 @@ func @interferencing_accesses( func @do_not_merge_multi_read( %arg0: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg1: tensor) { - tf_executor.graph { - // CHECK: tf_executor.island - %island = tf_executor.island { - // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) - %read0 = 
"tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - // CHECK-NEXT: %[[READ_1:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) - %read1 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - // CHECK-NEXT: %[[EXE:.*]] = "tf_device.launch" - // CHECK-NEXT: "tf.TPUExecute"(%[[READ_0]], %[[READ_1]], %[[ARG_1]]) - %execute = "tf_device.launch"() ( { - %0 = "tf.TPUExecute"(%read0, %read1, %arg1) { - Targs = [tensor<32xf32>, tensor<32xf32>], Tresults = [tensor<32xf32>]} - : (tensor<32xf32>, tensor<32xf32>, tensor) -> (tensor<32xf32>) - tf_device.return %0 : tensor<32xf32> - }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<32xf32> - // CHECK-NEXT: tf_device.return - // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]) - "tf.AssignVariableOp"(%arg0, %execute) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: tf_executor.yield - tf_executor.yield - } - tf_executor.fetch %island : !tf_executor.control - } + // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) + %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // CHECK-NEXT: %[[READ_1:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) + %read1 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // CHECK-NEXT: %[[EXE:.*]] = "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute"(%[[READ_0]], %[[READ_1]], %[[ARG_1]]) + %execute = "tf_device.launch"() ( { + %0 = "tf.TPUExecute"(%read0, %read1, %arg1) { + Targs = [tensor<32xf32>, tensor<32xf32>], Tresults = [tensor<32xf32>]} + : (tensor<32xf32>, tensor<32xf32>, tensor) -> (tensor<32xf32>) + tf_device.return %0 : tensor<32xf32> + }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<32xf32> + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]) + "tf.AssignVariableOp"(%arg0, %execute) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: return return } @@ -203,29 +175,118 @@ func @do_not_merge_multi_read( func @do_not_merge_multi_assign( %arg0: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg1: tensor) { - tf_executor.graph { - // CHECK: tf_executor.island - %island = tf_executor.island { - // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) - %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> - // CHECK-NEXT: %[[EXE:.*]]:2 = "tf_device.launch" - // CHECK-NEXT: "tf.TPUExecute"(%[[READ_0]], %[[ARG_1]]) - %execute:2 = "tf_device.launch"() ( { - %0:2 = "tf.TPUExecute"(%read0, %arg1) { - Targs = [tensor<32xf32>], Tresults = [tensor<32xf32>, tensor<32xf32>]} - : (tensor<32xf32>, tensor) -> (tensor<32xf32>, tensor<32xf32>) - tf_device.return %0#0, %0#1 : tensor<32xf32>, tensor<32xf32> - }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<32xf32>) + // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]]) + %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // CHECK-NEXT: %[[EXE:.*]]:2 = "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute"(%[[READ_0]], %[[ARG_1]]) + %execute:2 = "tf_device.launch"() ( { + %0:2 = "tf.TPUExecute"(%read0, %arg1) { + Targs = [tensor<32xf32>], Tresults = [tensor<32xf32>, tensor<32xf32>]} + : (tensor<32xf32>, tensor) -> (tensor<32xf32>, tensor<32xf32>) + 
tf_device.return %0#0, %0#1 : tensor<32xf32>, tensor<32xf32> + }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<32xf32>) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#0) + "tf.AssignVariableOp"(%arg0, %execute#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#1) + "tf.AssignVariableOp"(%arg0, %execute#1) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // CHECK-NEXT: return + return +} + +// ----- + +// Tests that the pass merges only variable reads/writes on the same device, +// with TPUExecutes in a tf_device.parallel_execute. + +// CHECK-LABEL: func @parallel_execute +// CHECK-SAME: %[[ARG_0:.*]]: tensor<*x!tf.resource>> +// CHECK-SAME: %[[ARG_1:.*]]: tensor<*x!tf.resource>> +// CHECK-SAME: %[[ARG_2:.*]]: tensor +func @parallel_execute( + %arg0: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, + %arg1: tensor<*x!tf.resource>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:1"}, + %arg2: tensor) { + %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %read1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<64xf32> + // CHECK-NOT: "tf.ReadVariableOp" + // CHECK: "tf_device.parallel_execute" + %pe:2 = "tf_device.parallel_execute"() ( { + // CHECK: "tf_device.launch" + %execute0 = "tf_device.launch"() ( { + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ARG_0]], %[[ARG_2]]) + %0 = "tf.TPUExecute"(%read0, %arg2) : (tensor<32xf32>, tensor) -> tensor<32xf32> // CHECK-NEXT: tf_device.return - // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#0) - "tf.AssignVariableOp"(%arg0, %execute#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#1) - "tf.AssignVariableOp"(%arg0, %execute#1) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () - // CHECK-NEXT: tf_executor.yield - tf_executor.yield - } - tf_executor.fetch %island : !tf_executor.control + tf_device.return %0 : tensor<32xf32> + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:0" + }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<32xf32> + tf_device.return %execute0 : tensor<32xf32> + }, { + // CHECK: "tf_device.launch" + %execute1 = "tf_device.launch"() ( { + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ARG_1]], %[[ARG_2]]) + %1 = "tf.TPUExecute"(%read1, %arg2) : (tensor<64xf32>, tensor) -> tensor<64xf32> + // CHECK-NEXT: tf_device.return + tf_device.return %1 : tensor<64xf32> + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:1" + }) {device = "/job:localhost/replica:0/task:0/device:TPU:1"} : () -> tensor<64xf32> + tf_device.return %execute1 : tensor<64xf32> + }) : () -> (tensor<32xf32>, tensor<64xf32>) + // CHECK-NOT: "tf.AssignVariableOp" + "tf.AssignVariableOp"(%arg0, %pe#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%arg1, %pe#1) : (tensor<*x!tf.resource>>, tensor<64xf32>) -> () + return +} + +// ----- + +// Tests that the pass merges variable reads/writes for TPUExecutes in a +// tf_device.parallel_execute that is replicated (tf_device.replicate). 
+ +// CHECK-LABEL: func @replicated_parallel_execute +// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor<*x!tf.resource>> +// CHECK-SAME: %[[ARG_1:[a-z0-9]+]]: tensor<*x!tf.resource>> +// CHECK-SAME: %[[ARG_2:[a-z0-9]+]]: tensor<*x!tf.resource>> +// CHECK-SAME: %[[ARG_3:[a-z0-9]+]]: tensor<*x!tf.resource>> +// CHECK-SAME: %[[ARG_4:[a-z0-9]+]]: tensor +func @replicated_parallel_execute( + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor<*x!tf.resource>>, + %arg3: tensor<*x!tf.resource>>, + %arg4: tensor) { + // CHECK: tf_device.replicate + // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]+]]: tensor<*x!tf.resource>> + // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]+]]: tensor<*x!tf.resource>> + tf_device.replicate([%arg0, %arg1] as %ri0: tensor<*x!tf.resource>>, + [%arg2, %arg3] as %ri1: tensor<*x!tf.resource>>) {n = 2 : i32} { + // CHECK-NOT: "tf.ReadVariableOp" + %read0 = "tf.ReadVariableOp"(%ri0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %read1 = "tf.ReadVariableOp"(%ri1) : (tensor<*x!tf.resource>>) -> tensor<64xf32> + // CHECK: "tf_device.parallel_execute" + %pe:2 = "tf_device.parallel_execute"() ( { + // CHECK: "tf_device.launch" + %execute0 = "tf_device.launch"() ( { + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[RI_0]], %[[ARG_4]]) + %0 = "tf.TPUExecute"(%read0, %arg4) : (tensor<32xf32>, tensor) -> tensor<32xf32> + // CHECK-NEXT: tf_device.return + tf_device.return %0 : tensor<32xf32> + }) {device = ""} : () -> tensor<32xf32> + tf_device.return %execute0 : tensor<32xf32> + }, { + // CHECK: "tf_device.launch" + %execute1 = "tf_device.launch"() ( { + // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[RI_1]], %[[ARG_4]]) + %1 = "tf.TPUExecute"(%read1, %arg4) : (tensor<64xf32>, tensor) -> tensor<64xf32> + // CHECK-NEXT: tf_device.return + tf_device.return %1 : tensor<64xf32> + }) {device = ""} : () -> tensor<64xf32> + tf_device.return %execute1 : tensor<64xf32> + }) : () -> (tensor<32xf32>, tensor<64xf32>) + // CHECK-NOT: "tf.AssignVariableOp" + "tf.AssignVariableOp"(%ri0, %pe#0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%ri1, %pe#1) : (tensor<*x!tf.resource>>, tensor<64xf32>) -> () } return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index d9107185954..7ee20d23df3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -5,7 +5,7 @@ // expected-error@+1 {{requires attribute 'tf.versions'}} module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_tf_versions() { - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -20,7 +20,7 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", 
"/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_devices() { // expected-error@+1 {{error in fetching TPU compilation/execution devices: no TPU_SYSTEM devices found}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -36,7 +36,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_num_cores_per_replica() { // expected-error@+1 {{requires attribute 'num_cores_per_replica'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -51,7 +51,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_num_cores_per_replica() { // expected-error@+1 {{requires attribute 'num_cores_per_replica'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = "", step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = "", step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -66,7 +66,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_num_cores_per_replica() { // expected-error@+1 {{requires attribute 'step_marker_location'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -81,7 +81,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = 
{producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_step_marker_location() { // expected-error@+1 {{requires attribute 'step_marker_location'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = 1, padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = 1, padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -96,7 +96,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unparsable_step_marker_location() { // expected-error@+1 {{bad 'step_marker_location' attribute with value 'test'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "test", padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "test", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -111,7 +111,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @missing_padding_map() { // expected-error@+1 {{requires attribute 'padding_map'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP"} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -126,7 +126,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_padding_map() { // expected-error@+1 {{requires attribute 'padding_map'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ""} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () 
return } func @empty_func() { @@ -141,7 +141,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @bad_element_padding_map() { // expected-error@+1 {{bad 'padding_map' attribute at index 0, not a string}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [1]} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [1], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -155,8 +155,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unparsable_element_padding_map() { - // expected-error@+1 {{bad 'padding_map' attribute at index 0 with value 'test'}} - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["test"]} : () -> () + // expected-error@+1 {{bad 'padding_map' attribute at index 0 with value 'test': failed to parse to tpu::PaddingMap}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["test"], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () return } func @empty_func() { @@ -166,12 +166,193 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- +// Tests `tf_device.launch_func` with missing `topology` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @missing_topology() { + // expected-error@+1 {{requires attribute 'topology'}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// Tests `tf_device.launch_func` with bad `topology` attribute. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_topology() { + // expected-error@+1 {{requires attribute 'topology'}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = 1 : i32, device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// Tests `tf_device.launch_func` with `topology` attribute resulting in device assignment error. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @invalid_topology() { + // expected-error@+1 {{error in fetching TPU compilation/execution devices}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "test", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// Tests `tf_device.launch_func` with missing `device_assignment` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @missing_device_assignment() { + // expected-error@+1 {{requires attribute 'device_assignment'}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// Tests `tf_device.launch_func` with bad `device_assignment` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_device_assignment() { + // expected-error@+1 {{requires attribute 'device_assignment'}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = "", input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// Tests `tf_device.launch_func` with bad element in `device_assignment` attribute. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_element_device_assignment() { + // expected-error@+1 {{bad 'device_assignment' attribute at index 0, not an int}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [""], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// The following topology is used in subsequent test cases: +// Proto debug string: +// mesh_shape: 1 +// mesh_shape: 1 +// mesh_shape: 1 +// mesh_shape: 2 +// num_tasks: 1 +// num_tpu_devices_per_task: 2 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 1 +// Serialized string: +// "\0A\04\01\01\01\02\10\01\18\02\22\06\00\00\00\00\00\00\00\01" + +// ----- + +// Tests `tf_device.launch_func` with `device_assignment` attribute resulting in device assignment error. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @invalid_device_assignment() { + // expected-error@+1 {{error in fetching TPU compilation/execution devices}} + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "\0A\03\01\01\02\10\01\18\02\22\06\00\00\00\00\00\01", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () + return + } + func @empty_func() { + return + } +} + +// ----- + +// Tests `tf_device.launch_func` with missing `input_sharding_configuration` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @missing_input_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{requires attribute 'input_sharding_configuration'}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], output_sharding_configuration = []} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// The following op sharding is used in subsequent test cases: +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// Serialized string: +// "\08\01\1A\01\01\22\01\00" + +// ----- + +// Tests `tf_device.launch_func` with bad `input_sharding_configuration` attribute. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_input_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{requires attribute 'input_sharding_configuration'}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = "", output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// Tests `tf_device.launch_func` with mismatched `input_sharding_configuration` attribute size. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @mismatched_size_input_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{bad 'input_sharding_configuration' attribute, expected array attribute of size 1, got size 0}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + // Tests `tf_device.launch_func` with unsupported operand type. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { func @unsupported_operand_type(%arg0: tensor) { // expected-error@+1 {{failed to determine operand type at index 0: Converting i2 to DataType}} - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = []} : (tensor) -> tensor + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor return } func @empty_func(%arg0: tensor) -> tensor { @@ -181,6 +362,112 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- +// Tests `tf_device.launch_func` with bad element in `input_sharding_configuration` attribute. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_element_input_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{bad 'input_sharding_configuration' attribute at index 0, not a string}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [1], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// Tests `tf_device.launch_func` with unparsable element in `input_sharding_configuration` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @unparsable_element_input_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{bad 'input_sharding_configuration' attribute at index 0 with value 'test': failed to parse to xla::OpSharding}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["test"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// Tests `tf_device.launch_func` with missing `output_sharding_configuration` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @missing_output_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{requires attribute 'output_sharding_configuration'}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_ENTRY", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// Tests `tf_device.launch_func` with bad `output_sharding_configuration` attribute. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_output_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{requires attribute 'output_sharding_configuration'}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ""} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// Tests `tf_device.launch_func` with mismatched `output_sharding_configuration` attribute size. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @mismatched_size_output_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{bad 'output_sharding_configuration' attribute, expected array attribute of size 1, got size 0}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = []} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + + +// Tests `tf_device.launch_func` with bad element in `output_sharding_configuration` attribute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @bad_element_output_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{bad 'output_sharding_configuration' attribute at index 0, not a string}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = [1]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + +// Tests `tf_device.launch_func` with unparsable element in `output_sharding_configuration` attribute. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + func @unparsable_element_output_sharding_configuration(%arg0: tensor) { + // expected-error@+1 {{bad 'output_sharding_configuration' attribute at index 0 with value 'test': failed to parse to xla::OpSharding}} + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["test"]} : (tensor) -> tensor + return + } + func @empty_func(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- + // Tests `tf_device.launch_func` with empty `step_marker_location` attribute // defaults to `STEP_MARK_AT_ENTRY`. // @@ -191,7 +478,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @default_step_marker_location func @default_step_marker_location() { - "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : () -> () + "tf_device.launch_func"() {_tpu_replicate = "cluster0", device = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = []} : () -> () // CHECK: metadata // CHECK-SAME: num_replicas: 1 // CHECK-SAME: num_cores_per_replica: 1 @@ -210,7 +497,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @unranked_shape_arg func @unranked_shape_arg(%arg0: tensor<*xi32>) -> tensor<*xi32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor<*xi32>) -> tensor<*xi32> + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xi32>) -> tensor<*xi32> // CHECK: metadata // CHECK-SAME: shape {\0A unknown_rank: true @@ -228,7 +515,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @partial_shape_arg func @partial_shape_arg(%arg0: tensor) -> tensor { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, 
num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor) -> tensor + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: metadata // CHECK-SAME: args // CHECK-SAME: shape {\0A dim {\0A size: -1\0A }\0A dim {\0A size: -1\0A }\0A dim {\0A size: 3\0A }\0A } @@ -259,7 +546,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @static_shape_arg func @static_shape_arg(%arg0: tensor<1x2x3xi32>) -> tensor<1x2x3xi32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor<1x2x3xi32>) -> tensor<1x2x3xi32> + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<1x2x3xi32>) -> tensor<1x2x3xi32> // CHECK: metadata // CHECK-SAME: args // CHECK-SAME: shape @@ -284,7 +571,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @resource_arg func @resource_arg(%arg0: tensor<*x!tf.resource>) -> tensor<*x!tf.resource> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor<*x!tf.resource>) -> tensor<*x!tf.resource> + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*x!tf.resource>) -> tensor<*x!tf.resource> // CHECK: metadata // CHECK: dtype: DT_RESOURCE // CHECK-SAME: kind: VARIABLE @@ -303,7 +590,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @parameter_arg func @parameter_arg(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor<*xf32>) -> tensor<*xf32> + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = 
[], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xf32>) -> tensor<*xf32> // CHECK: metadata // CHECK: dtype: DT_FLOAT // CHECK-SAME: kind: PARAMETER @@ -363,7 +650,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @metadata func @metadata(%arg0: tensor<8xi32>) -> tensor<8xi32> { - %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor<8xi32>) -> tensor<8xi32> + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> // CHECK: metadata // CHECK-SAME: args // CHECK-SAME: dtype: DT_INT32 @@ -407,9 +694,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NOT: "tf.Shape"(%[[ARG_3]]) // CHECK: %[[ARG_0_SHAPE:[0-9]*]] = "tf.Shape"(%[[ARG_0]]) // CHECK: %[[ARG_2_SHAPE:[0-9]*]] = "tf.Shape"(%[[ARG_2]]) - %0 = "tf_device.launch_func"(%arg0, %arg1, %arg2, %arg3) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor<*xi32>, tensor<8xi32>, tensor<*xi32>, tensor<8xi32>) -> tensor<8xi32> + %0 = "tf_device.launch_func"(%arg0, %arg1, %arg2, %arg3) {_tpu_replicate = "cluster0", device = "", func = @_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<*xi32>, tensor<8xi32>, tensor<*xi32>, tensor<8xi32>) -> tensor<8xi32> // CHECK: "tf._TPUCompileMlir"(%[[ARG_0_SHAPE]], %[[ARG_2_SHAPE]]) - // CHECK-SAME: NumDynamicShapes = 2 return %0: tensor<8xi32> } @@ -429,11 +715,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = 
"tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -479,7 +764,6 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -491,7 +775,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute"(%[[RI_0]], %[[COMPILE_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%ri_0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %2 = "tf_device.launch_func"(%ri_0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: tf_device.return %[[EXECUTE_OUTPUT]] tf_device.return %2 : tensor @@ -519,7 +803,7 @@ module attributes {tf.versions = {producer = 888 : i32}} { func @single_gpu_launch_func(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - %1 = "tf_device.launch_func"(%0) {device = "gpu0", func = @gpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {device = "gpu0", func = @gpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: tf_device.launch_func // CHECK-SAME: device = "gpu0" // CHECK-SAME: func = @gpu0_func @@ -547,11 +831,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // 
CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -597,11 +880,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -643,11 +925,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -697,11 +978,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -745,11 +1025,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = 
"tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func0, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func0, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -760,11 +1039,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[EXECUTE0_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE0_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "", func = @tpu0_func1, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "", func = @tpu0_func1, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[EXECUTE0_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -803,11 +1081,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE0_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -818,11 +1095,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[EXECUTE0_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: 
"tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE0_OUTPUT]]#1) - %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %2 = "tf_device.launch_func"(%1) {_tpu_replicate = "cluster1", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[EXECUTE0_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[EXECUTE0_OUTPUT]]) // CHECK: %[[COMPILE1_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[EXECUTE0_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -857,11 +1133,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %0 = "tf.A"(%arg0) : (tensor) -> tensor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"]} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]]) // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]]) - // CHECK-SAME: NumDynamicShapes = 1 // CHECK-SAME: metadata // CHECK-SAME: mlir_module // CHECK-SAME: func @main @@ -916,7 +1191,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests that TPUCompilationResult operations are properly rewritten +// Tests that TPUCompilationResult operations are properly rewritten. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @tpu_compilation_result @@ -928,7 +1203,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf.TPUCompileSucceededAssert" // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch" // CHECK-NEXT: "tf.TPUExecute" - %1 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = []} : (tensor) -> tensor + %1 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor %compile_result = "tf.TPUCompilationResult"() {_tpu_replicate = "cluster0"} : () -> tensor %compile_result2 = "tf.TPUCompilationResult"() {_tpu_replicate = "cluster0"} : () -> tensor @@ -944,3 +1219,179 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return %0 : tensor } } + +// ----- + +// Tests devices are set properly for non replicated model parallelism. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} { + // CHECK-LABEL: func @non_replicated_parallel_execute + func @non_replicated_parallel_execute(%arg0: tensor<8xi32>) -> tensor<8xi32> { + // CHECK: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" + // CHECK-NEXT: "tf._TPUCompileMlir"() + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:1" + %0 = "tf_device.launch_func"(%arg0) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + return %0 : tensor<8xi32> + } + func @tpu0_func(%arg0: tensor<8xi32>) -> tensor<8xi32> { + return %arg0 : tensor<8xi32> + } +} + +// ----- + +// The following topology is used in subsequent test cases: +// Proto debug string: +// mesh_shape: 1 +// mesh_shape: 2 +// mesh_shape: 1 +// mesh_shape: 2 +// num_tasks: 2 +// num_tpu_devices_per_task: 2 +// device_coordinates: 0 +// device_coordinates: 
0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 0 +// device_coordinates: 1 +// device_coordinates: 0 +// device_coordinates: 1 +// Serialized string: +// "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01" +// ----- + +// Tests devices are set properly for replicated model parallelism. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { + // CHECK-LABEL: func @replicated_parallel_execute + func @replicated_parallel_execute(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { + // CHECK: tf_device.replicate + // CHECK-SAME: devices = + // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] + // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { + // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" + // CHECK-NEXT: "tf._TPUCompileMlir"() + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_CORE_1" + %1 = "tf_device.launch_func"(%ri) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + tf_device.return %1 : tensor<8xi32> + } + return %0#0, %0#1 : tensor<8xi32>, tensor<8xi32> + } + func @tpu0_func(%arg0: tensor<8xi32>) -> tensor<8xi32> { + return %arg0 : tensor<8xi32> + } +} + +// ----- + +// Tests devices are set properly for replicated model parallelism with +// outputs to TPU computation placed on logical device 0. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { + // CHECK-LABEL: func @parallel_execute_with_different_outputs + func @parallel_execute_with_different_outputs(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { + // CHECK: tf_device.replicate + // CHECK-SAME: devices = + // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] + // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { + // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" + // CHECK-NEXT: "tf._TPUCompileMlir"() + // CHECK: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0) + // CHECK: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK-NEXT: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute" + // CHECK-NEXT: tf_device.return %[[EXECUTE_OUTPUT]] + // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUExecute" + // CHECK: device = "TPU_REPLICATED_CORE_1" + %1 = "tf_device.launch_func"(%ri) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32> + tf_device.return %1 : tensor<8xi32> + } + return %0#0, %0#1 : tensor<8xi32>, tensor<8xi32> + } + func @tpu0_func(%arg0: tensor<8xi32>) -> tensor<8xi32> { + return %arg0 : tensor<8xi32> + } +} + +// ----- + +// Tests devices are set properly for replicated model parallelism with +// TPU computation with maximal and replicated outputs. 
+ +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { + // CHECK-LABEL: func @parallel_execute_with_replicated_output + func @parallel_execute_with_replicated_output(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>) { + // CHECK: tf_device.replicate + // CHECK-SAME: devices = + // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] + // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { + // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" + // CHECK-NEXT: "tf._TPUCompileMlir"() + // CHECK: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0) + // CHECK: device = "/job:localhost/replica:0/task:0/device:CPU:0" + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:3 = "tf_device.parallel_execute" + // CHECK-NEXT: %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch" + // CHECK-NEXT: %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute" + // CHECK-NEXT: tf_device.return %[[EXECUTE_0_OUTPUT]] + // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" + // CHECK: %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK-NEXT: %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute" + // CHECK-NEXT: tf_device.return %[[EXECUTE_1_OUTPUT]] + // CHECK: device = "TPU_REPLICATED_CORE_1" + %1, %2 = "tf_device.launch_func"(%ri) {_tpu_replicate = "cluster0", device = "", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""]} : (tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>) + tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1> + } + return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1> + } + func @tpu0_func(%arg0: tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>) { + %1, %2 = "tf.A"(%arg0) : (tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>) + %3 = "tf.XlaSharding"(%2) { _XlaSharding = "" } : (tensor<*xi1>) -> tensor<*xi1> + return %1, %3 : tensor<*xi32>, tensor<*xi1> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir index 7b0c82aaf6a..87eb02eda94 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir @@ -16,6 +16,24 @@ func @empty_func() { // ----- +// Tests with a block argument inputs/outputs with no xla sharding op attached +// gets default maximal(0) sharding configuration. 
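+// Note (added for clarity; assumes the standard xla.OpSharding encoding): the +// expected string "\08\01\1A\01\01\22\01\00" below is the serialized proto for a +// MAXIMAL sharding assigned to logical device 0, i.e. the default maximal(0) +// configuration mentioned above.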
+// CHECK-LABEL: func @check_default_sharding_for_block_arg_inputs_outputs +func @check_default_sharding_for_block_arg_inputs_outputs(%arg0: tensor<*xi32>) { + "tf_device.launch_func"(%arg0) {device = "", func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> () + // CHECK: input_sharding_configuration + // CHECK-SAME: ["\08\01\1A\01\01\22\01\00"] + // CHECK: output_sharding_configuration + // CHECK-SAME: ["\08\01\1A\01\01\22\01\00"] + return +} + +func @func_without_sharding(%arg0: tensor<*xi32>) -> tensor<*xi32> { + return %arg0 : tensor<*xi32> +} + +// ----- + // Tests with a inputs/outputs with no xla sharding op attached gets // default maximal(0) sharding configuration. // CHECK-LABEL: func @check_default_sharding_for_inputs_outputs diff --git a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir index 4a27e74ad70..5a3f0b6e997 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/unroll-batch-matmul.mlir @@ -234,26 +234,6 @@ func @batchMatMulMatrix(%arg0: tensor<4x5xf32>, %arg1: tensor<5x6xf32>) -> tenso // ----- -func @batchMatMulV2VectorLhsInputMatchFailure(%arg0: tensor<10xf32>, %arg1: tensor<10x20xf32>) -> tensor<10x20xf32> { - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<10xf32>, tensor<10x20xf32>) -> tensor<10x20xf32> - return %0 : tensor<10x20xf32> - - // CHECK-LABEL: batchMatMulV2VectorLhs - // CHECK: %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<10xf32>, tensor<10x20xf32>) -> tensor<10x20xf32> -} - -// ----- - -func @batchMatMulV2VectorRhsInputMatchFailure(%arg0: tensor<10x20xf32>, %arg1: tensor<10xf32>) -> tensor<10x20xf32> { - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<10x20xf32>, tensor<10xf32>) -> tensor<10x20xf32> - return %0 : tensor<10x20xf32> - - // CHECK-LABEL: batchMatMulV2VectorRhs - // CHECK: %0 = "tf.BatchMatMulV2"(%arg0, %arg1) : (tensor<10x20xf32>, tensor<10xf32>) -> tensor<10x20xf32> -} - -// ----- - func @batchMatMulVectorLhsInputMatchFailure(%arg0: tensor<10xf32>, %arg1: tensor<10x20xf32>) -> tensor<10x20xf32> { %0 = "tf.BatchMatMul"(%arg0, %arg1) : (tensor<10xf32>, tensor<10x20xf32>) -> tensor<10x20xf32> return %0 : tensor<10x20xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc new file mode 100644 index 00000000000..6cd82d1472d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc @@ -0,0 +1,125 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/Analysis/LoopAnalysis.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/OpImplementation.h" // TF:llvm-project +#include "mlir/IR/PatternMatch.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/Functional.h" // TF:llvm-project +#include "mlir/Support/LLVM.h" // TF:llvm-project +#include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TF { + +namespace { +// Replace TF BatchMatMul by TF Einsum +struct BatchMatMulToEinsumPass : public FunctionPass { + void runOnFunction() override; +}; + +void BatchMatMulToEinsumPass::runOnFunction() { + OwningRewritePatternList patterns; + auto func = getFunction(); + + patterns.insert, + ConvertTFBatchMatMulToEinsumOp>( + &getContext()); + applyPatternsGreedily(func, patterns); +} + +} // namespace + +template +PatternMatchResult +ConvertTFBatchMatMulToEinsumOp::matchAndRewrite( + BatchMatMulOpType op, PatternRewriter& rewriter) const { + Value input_lhs = op.x(); + Value input_rhs = op.y(); + + if (!input_lhs.getType().isa()) { + // LHS must be a ranked tensor type + return this->matchFailure(); + } + if (!input_rhs.getType().isa()) { + // RHS must be a ranked tensor type + return this->matchFailure(); + } + + auto lhs_type = input_lhs.getType().dyn_cast(); + auto rhs_type = input_rhs.getType().dyn_cast(); + + if (!lhs_type || !rhs_type) { + return this->matchFailure(); + } + + auto lhs_shape = lhs_type.getShape(); + auto rhs_shape = rhs_type.getShape(); + + Location loc = op.getLoc(); + + // Ensure that input ranks are at least 2. + const int dims_a = lhs_shape.size(); + const int dims_b = rhs_shape.size(); + if (dims_a < 2 || dims_b < 2) { + // Both inputs must have rank >= 2 + return this->matchFailure(); + } + + // einsum equation for batchmatmul + std::string equation("...mk,...kn->...mn"); + + if (op.adj_x()) { + std::swap(equation[3], equation[4]); + } + if (op.adj_y()) { + std::swap(equation[6 + 3], equation[6 + 4]); + } + + llvm::SmallVector inputs = {input_lhs, input_rhs}; + rewriter.replaceOpWithNewOp(op, op.getType(), + /*inputs=*/ValueRange(inputs), + /*equation=*/equation); + + return this->matchSuccess(); +} + +static PassRegistration pass( + "tf-batch-matmul-to-tf-einsum", + "Replace TF BatchMatMul op by TF Einsum op."); + +std::unique_ptr> CreateBatchMatMulToEinsumPass() { + return std::make_unique(); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h new file mode 100644 index 00000000000..cd836892ae9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/PatternMatch.h" // TF:llvm-project +#include "mlir/IR/TypeUtilities.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TF { + +// Replace TF BatchMatMul by TF Einsum op +template +class ConvertTFBatchMatMulToEinsumOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite( + BatchMatMulOpType op, + PatternRewriter& rewriter) const override; // NOLINT +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index 992284b320b..73110a724ea 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -43,6 +43,8 @@ void AddGraphExportLoweringPasses(OpPassManager &pm) { pm.addNestedPass(CreateBreakUpIslandsPass()); pm.addNestedPass(TFDevice::CreateReplicateToIslandPass()); pm.addNestedPass(CreateBreakUpIslandsPass()); + pm.addNestedPass(TFDevice::CreateParallelExecuteToIslandsPass()); + pm.addNestedPass(CreateBreakUpIslandsPass()); pm.addNestedPass(TFDevice::CreateLaunchToDeviceAttributePass()); } @@ -71,6 +73,8 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { // Run island coarsening before shape inference to allow more exact shape // inference using constant folding within islands. pm.addNestedPass(tf_executor::CreateTFExecutorIslandCoarseningPass()); + // TODO(b/150462212): Move graph pruning before island coarsening. + pm.addNestedPass(tf_executor::CreateTFExecutorGraphPruningPass()); // Run shape inference so that tf_executor/tf_device ops created later will // likely to inherit more concrete types. 
pm.addPass(TF::CreateTFShapeInferencePass()); @@ -90,6 +94,7 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { pm.addPass(TF::CreateResourceDeviceInferencePass()); pm.addPass(TFDevice::CreateClusterOutliningPass()); pm.addPass(CreateTPUDynamicPaddingMapperPass()); + pm.addPass(CreateTPUShardingIdentificationPass()); pm.addPass(TFDevice::CreateAnnotateParameterReplicationPass()); pm.addPass(CreateTPURewritePass()); pm.addNestedPass(TFDevice::CreateReplicateInvariantOpHoistingPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td index 7c4030ed3f4..05d7a22261a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/canonicalize.td @@ -27,6 +27,10 @@ def SingleResultAndOperandHaveSameType : Constraint< def IsRank2Tensor : Type, "Rank 2 tensor">; +// Checks if the value has only one user. +def HasOneUse : Constraint>; + + //===----------------------------------------------------------------------===// // Add op patterns. //===----------------------------------------------------------------------===// @@ -175,3 +179,11 @@ def TruncateDivWithSqrtDivisor : Pat<(TF_TruncateDivOp $arg0, def XdivyWithSqrtDivisor : Pat<(TF_XdivyOp $arg0, (TF_SqrtOp $arg1)), (TF_MulNoNanOp (TF_RsqrtOp $arg1), $arg0)>; + + +//===----------------------------------------------------------------------===// +// Cast op followed by a ReadVariable op can be folded into the ReadVariable +//===----------------------------------------------------------------------===// + +def ReadVariableOfCast : Pat<(TF_ReadVariableOp (TF_CastOp:$output $x, BoolAttr:$Truncate)), (TF_ReadVariableOp $x), [(HasOneUse $output)]>; + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc new file mode 100644 index 00000000000..71426b04d99 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc @@ -0,0 +1,167 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace mlir { +namespace TF { +namespace collection_ops_util { + +Value CreateScalarConst(int value, OpBuilder builder, Location loc) { + tensorflow::Tensor scalar_tensor(tensorflow::DT_INT32, {}); + scalar_tensor.scalar()() = value; + return builder.create( + loc, tensorflow::ConvertTensor(scalar_tensor, &builder).ValueOrDie()); +} + +Value GetR1Const(ArrayRef r1, OpBuilder builder, Location loc) { + tensorflow::Tensor shape_tensor(tensorflow::DT_INT32, + {static_cast(r1.size())}); + for (int i = 0; i < r1.size(); ++i) { + shape_tensor.vec()(i) = r1[i]; + } + return builder.create( + loc, tensorflow::ConvertTensor(shape_tensor, &builder).ValueOrDie()); +} + +Value GetIndicesForElement(Value index, Value buffer, OpBuilder builder, + Location loc) { + auto buffer_type = buffer.getType().cast(); + if (buffer_type.getShape().size() == 1) return index; + // Create a concat of index and trailing zeros. + llvm::SmallVector zeros(buffer_type.getShape().size() - 1, 0); + auto zeros_tensor = GetR1Const(zeros, builder, loc); + return builder.create( + loc, + ArrayRef{RankedTensorType::get( + {static_cast(buffer_type.getShape().size())}, + getElementTypeOrSelf(index.getType()))}, + ArrayRef{index, zeros_tensor, CreateScalarConst(0, builder, loc)}, + ArrayRef{}); +} + +Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc) { + auto buffer_type = buffer.getType().cast(); + // Create a slice then reshape to remove the leading trivial dimension of + // size 1. 
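+ // For example (illustrative note, not from the original source): for a buffer + // of type tensor<5x3x7xf32> and index i, this emits a tf.Slice of size + // [1, 3, 7] at offsets [i, 0, 0] followed by a tf.Reshape to tensor<3x7xf32>.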
+ llvm::SmallVector slice_size = + llvm::to_vector<8>(buffer_type.getShape()); + slice_size[0] = 1; + auto size_const = GetR1Const(slice_size, builder, loc); + auto slice_type = + RankedTensorType::get(slice_size, buffer_type.getElementType()); + auto slice = builder.create( + loc, ArrayRef{slice_type}, + ArrayRef{buffer, GetIndicesForElement(index, buffer, builder, loc), + size_const}, + ArrayRef{}); + auto element_type = RankedTensorType::get(buffer_type.getShape().drop_front(), + buffer_type.getElementType()); + auto reshape = builder.create( + loc, ArrayRef{element_type}, + ArrayRef{slice, GetR1Const(element_type.getShape(), builder, loc)}, + ArrayRef{}); + return reshape.output(); +} + +Value SetElement(Value index, Value buffer, Value element, OpBuilder builder, + Location loc) { + auto buffer_type = buffer.getType().cast(); + // Reshape the element to add a leading dimension of size 1, then perform a + // dynamic update slice. + auto slice_shape = llvm::to_vector<8>(buffer_type.getShape()); + slice_shape[0] = 1; + auto update_slice = builder.create( + loc, + ArrayRef{ + RankedTensorType::get(slice_shape, buffer_type.getElementType())}, + ArrayRef{element, GetR1Const(slice_shape, builder, loc)}, + ArrayRef{}); + return builder + .create( + loc, ArrayRef{buffer.getType()}, + ArrayRef{buffer, update_slice, + GetIndicesForElement(index, buffer, builder, loc)}, + ArrayRef{}) + .output(); +} + +TensorType GetSizeType(OpBuilder builder) { + return RankedTensorType::get({1}, builder.getIntegerType(32)); +} + +Value ReshapeScalarToSizeType(OpBuilder builder, Value scalar, Location loc) { + auto size_type = GetSizeType(builder); + return builder.create( + loc, ArrayRef{size_type}, + ArrayRef{scalar, GetR1Const(size_type.getShape(), builder, loc)}, + ArrayRef{}); +} + +LogicalResult CreateInitBufferValue(ArrayRef element_shape, + Value max_size, Operation* op, + Type element_dtype, OpBuilder builder, + Value* buffer) { + auto max_count_op = max_size.getDefiningOp(); + if (!max_count_op) return op->emitOpError("unknown max element count"); + auto max_count_const_op = llvm::dyn_cast(max_count_op); + if (!max_count_const_op) return op->emitOpError("unknown max element count"); + int64_t max_size_const = + (*max_count_const_op.value().getValues().begin()).getSExtValue(); + llvm::SmallVector buffer_shape; + buffer_shape.push_back(max_size_const); + for (int64_t dim : element_shape) { + buffer_shape.push_back(dim); + } + auto zero = CreateScalarConst(0, builder, op->getLoc()); + if (getElementTypeOrSelf(zero.getType()) != element_dtype) { + zero = builder.create( + op->getLoc(), ArrayRef{RankedTensorType::get({}, element_dtype)}, + ArrayRef{zero}, ArrayRef{}); + } + auto buffer_type = RankedTensorType::get(buffer_shape, element_dtype); + auto broadcast = builder.create( + op->getLoc(), ArrayRef{buffer_type}, + ArrayRef{zero, GetR1Const(buffer_shape, builder, op->getLoc())}, + ArrayRef{}); + *buffer = broadcast.output(); + return success(); +} +} // namespace collection_ops_util +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h new file mode 100644 index 00000000000..6b86cafed3f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_COLLECTION_OPS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_COLLECTION_OPS_UTIL_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/Value.h" // TF:llvm-project +#include "mlir/Support/LLVM.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { +namespace collection_ops_util { + +// This file includes utilities for decomposing collection ops (stack, tensor +// list, tensor array) in TF. We represent such a data structure as a buffer of +// shape [max_element_count, element_shape]. + +// Creates an i32 scalar tf.Const. +Value CreateScalarConst(int value, OpBuilder builder, Location loc); + +// Creates an i32 vector tf.Const. +Value GetR1Const(ArrayRef r1, OpBuilder builder, Location loc); + +// Returns the type of the size tensor used to track a data structure's element +// count. It is a tensor<1xi32>, and we use R1 instead of a scalar because it is +// easier to concat it with other offsets. +TensorType GetSizeType(OpBuilder builder); + +// Reshapes a scalar value to match the size type tensor. +Value ReshapeScalarToSizeType(OpBuilder builder, Value scalar, Location loc); + +// Creates ops that represent the indices of the slice for an element in the +// buffer. Requires `index` to have tensor<1xi32> type. +Value GetIndicesForElement(Value index, Value buffer, OpBuilder builder, + Location loc); + +// Creates ops that slice the element out of a buffer at the given index. +// Requires `index` to have tensor<1xi32> type. +Value GetElement(Value index, Value buffer, OpBuilder builder, Location loc); + +// Creates ops that copy the buffer and update an element at the given index. +// Requires `index` to have tensor<1xi32> type. +Value SetElement(Value index, Value buffer, Value element, OpBuilder builder, + Location loc); + +// Creates the buffer for the data structure with given element shape, type and +// maximum size. +LogicalResult CreateInitBufferValue(ArrayRef element_shape, + Value max_size, Operation* op, + Type element_dtype, OpBuilder builder, + Value* buffer); +} // namespace collection_ops_util +} // namespace TF +} // namespace mlir +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_COLLECTION_OPS_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 7b46c6aec04..c1a87c289bf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -31,7 +32,8 @@ LogicalResult ConstantFoldFallbackHook( SmallVectorImpl& results) { // NOLINT // Instructions with side effects should not be constant folded to preserve // the original semantics. - if (!inst->hasNoSideEffect()) return failure(); + if (inst->getNumRegions() != 0 || !MemoryEffectOpInterface::hasNoEffect(inst)) + return failure(); // If any of the result types are variants, don't try to constant fold them. // This creates opaque variant constants which lose information and would diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc new file mode 100644 index 00000000000..5410ce4faf7 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -0,0 +1,296 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" + +#include +#include + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Regex.h" +#include "mlir/Analysis/LoopAnalysis.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/OpImplementation.h" // TF:llvm-project +#include "mlir/IR/PatternMatch.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/Functional.h" // TF:llvm-project +#include "mlir/Support/LLVM.h" // TF:llvm-project +#include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TF { + +namespace { + +// All supported Einsum equations. +enum EinsumEquation { + BatchMatMul, + FourDMatrixDotProd, + ThreeDReshapeTail, + FourDBatchMatMul, + UnsupportedEquation +}; + +// Tokens for parsing the given equation string. +enum EquationToken { + A, + B, + C, + D, + E, + COMMA, + ARROW, +}; +constexpr int kNumSupportedEquationVariables = 5; // A - E for now. 
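+// For example (added for clarity): the equation "BFND,NDH->BFH" tokenizes to +// {A, B, C, D, COMMA, C, D, E, ARROW, A, B, E}, since labels are assigned to +// equation variables in order of first appearance.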
+ +bool tokenizeEquation(const llvm::StringRef& equation, + std::vector* tokens) { + std::map label_axis_mapping; + int index = 0; + int variable_count = 0; + llvm::Regex r("[[:alpha:]]"); + while (index < equation.size()) { + if (r.match(equation.substr(index, 1))) { + const char ltr = equation[index]; + auto itr = label_axis_mapping.find(ltr); + if (itr == label_axis_mapping.end() && + variable_count < kNumSupportedEquationVariables) { + label_axis_mapping[ltr] = EquationToken(variable_count); + tokens->push_back(EquationToken(variable_count)); + variable_count++; + } else if (itr != label_axis_mapping.end()) { + tokens->push_back(itr->second); + } else { + // Ran out of equation variables. + return false; + } + } else if (equation.substr(index, 1).contains(",")) { + tokens->push_back(COMMA); + } else if ((index < (equation.size() - 1)) && + (equation.substr(index, 2).contains("->"))) { + tokens->push_back(ARROW); + index++; + } else { + // Unallowed character encountered. + return false; + } + index++; + } + return true; +} + +EinsumEquation parseEquation(const std::vector& eqn) { + auto is_equal = [](const std::vector& eqn1, + const std::initializer_list& eqn2) { + return std::equal(eqn1.begin(), eqn1.end(), eqn2.begin(), eqn2.end()); + }; + // IJK,IKM->IJM + if (is_equal(eqn, {A, B, C, COMMA, A, C, D, ARROW, A, B, D})) { + return EinsumEquation::BatchMatMul; + } + // BFND,NDH->BFH + if (is_equal(eqn, {A, B, C, D, COMMA, C, D, E, ARROW, A, B, E})) { + return EinsumEquation::FourDMatrixDotProd; + } + // BFNH,BTNH->BNFT + if (is_equal(eqn, {A, B, C, D, COMMA, A, E, C, D, ARROW, A, C, B, E})) { + return EinsumEquation::FourDBatchMatMul; + } + // BFD,DNH->BFNH + if (is_equal(eqn, {A, B, C, COMMA, C, D, E, ARROW, A, B, D, E})) { + return EinsumEquation::ThreeDReshapeTail; + } + return EinsumEquation::UnsupportedEquation; +} + +EinsumEquation tokenizeAndParse(const llvm::StringRef& equation) { + std::vector tokens; + if (tokenizeEquation(equation, &tokens)) { + return parseEquation(tokens); + } + return EinsumEquation::UnsupportedEquation; +} + +TF::TransposeOp createTransposeOp(Value value, Location loc, + llvm::ArrayRef permutation, + PatternRewriter* rewriter) { + auto value_type = value.getType().cast(); + auto shape = value_type.getShape(); + auto perm_type = RankedTensorType::get( + {static_cast(permutation.size())}, rewriter->getIntegerType(32)); + auto perm_attr = DenseElementsAttr::get(perm_type, permutation); + auto perm_op = rewriter->create(loc, perm_type, perm_attr); + std::vector transposed_shape(shape.begin(), shape.end()); + for (int i = 0; i < shape.size(); ++i) { + transposed_shape[i] = shape[permutation[i]]; + } + auto transposed_type = + RankedTensorType::get(transposed_shape, value_type.getElementType()); + return rewriter->create(loc, transposed_type, value, + perm_op); +} + +TF::ReshapeOp createReshapeOp(Value value, ArrayRef shape, + Type element_type, Location loc, + PatternRewriter* rewriter) { + int64_t shape_rank = shape.size(); + auto shape_spec_type = + RankedTensorType::get({shape_rank}, rewriter->getIntegerType(64)); + Type resultType = RankedTensorType::get(shape, element_type); + auto constant_attr = DenseElementsAttr::get(shape_spec_type, shape); + auto shape_tensor = + rewriter->create(loc, shape_spec_type, constant_attr); + return rewriter->create(loc, resultType, /*tensor=*/value, + /*shape=*/shape_tensor); +} + +} // namespace + +PatternMatchResult ConvertTFEinsumOp::matchAndRewrite( + TF::EinsumOp op, PatternRewriter& rewriter) const { + Type 
output_type = op.getResult().getType(); + Value lhs = op.getOperand(0); + Value rhs = op.getOperand(1); + Location loc = op.getLoc(); + + if (!lhs.getType().isa()) { + // LHS must be a ranked tensor type + return matchFailure(); + } + if (!rhs.getType().isa()) { + // RHS must be a ranked tensor type + return matchFailure(); + } + + auto lhs_type = lhs.getType().cast(); + auto rhs_type = rhs.getType().cast(); + auto lhs_shape = lhs_type.getShape(); + auto rhs_shape = rhs_type.getShape(); + + // Currently only support static shapes. + if (!(lhs_type.hasStaticShape() && rhs_type.hasStaticShape())) { + return matchFailure(); + } + + // Currently support use cases of LHS, RHS dims = 3 or 4 + const int dims_lhs = lhs_shape.size(); + const int dims_rhs = rhs_shape.size(); + if (dims_rhs < 3 || dims_rhs > 4 || dims_lhs < 3 || dims_lhs > 4) { + return matchFailure(); + } + + EinsumEquation einsum_eqn = tokenizeAndParse(op.equation()); + if (einsum_eqn == EinsumEquation::BatchMatMul) { + // Case "IJK,IKM->IJM" + auto bmm_op = rewriter.create( + loc, ArrayRef{output_type}, lhs, rhs, rewriter.getBoolAttr(false), + rewriter.getBoolAttr(false)); + rewriter.replaceOp(op, bmm_op.getResult()); + return matchSuccess(); + } + if (einsum_eqn == EinsumEquation::ThreeDReshapeTail) { + // Case "BFD,DNH->BFNH" + auto lhs_type = lhs.getType().cast(); + auto lhs_shape = lhs_type.getShape(); + const int lhs_dim0 = lhs_shape[0]; + const int lhs_dim1 = lhs_shape[1]; + // Reshape RHS + auto rhs_type = rhs.getType().cast(); + auto rhs_shape = rhs_type.getShape(); + auto rhs_element_type = rhs_type.getElementType(); + const int rhs_dim0 = rhs_shape[0]; + const int rhs_dim1 = rhs_shape[1]; + const int rhs_dim2 = rhs_shape[2]; + auto reshaped_rhs = createReshapeOp(rhs, {rhs_dim0, rhs_dim1 * rhs_dim2}, + rhs_element_type, loc, &rewriter); + + std::vector bmm_shape = {lhs_dim0, lhs_dim1, rhs_dim1 * rhs_dim2}; + auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); + auto bmm_op = rewriter.create( + loc, ArrayRef{bmm_type}, lhs, reshaped_rhs, + rewriter.getBoolAttr(false), rewriter.getBoolAttr(false)); + auto bmm_element_type = bmm_type.getElementType(); + auto final_reshape = + createReshapeOp(bmm_op, {lhs_dim0, lhs_dim1, rhs_dim1, rhs_dim2}, + bmm_element_type, loc, &rewriter); + rewriter.replaceOp(op, {final_reshape.getResult()}); + return matchSuccess(); + } + if (einsum_eqn == EinsumEquation::FourDMatrixDotProd) { + // Case "BFND,NDH->BFH" + // Reshape LHS + auto lhs_element_type = lhs_type.getElementType(); + const int lhs_dim0 = lhs_shape[0]; + const int lhs_dim1 = lhs_shape[1]; + const int lhs_dim2 = lhs_shape[2]; + const int lhs_dim3 = lhs_shape[3]; + auto reshaped_lhs = + createReshapeOp(lhs, {lhs_dim0, lhs_dim1, lhs_dim2 * lhs_dim3}, + lhs_element_type, loc, &rewriter); + // Reshape RHS + auto rhs_element_type = rhs_type.getElementType(); + const int rhs_dim0 = rhs_shape[0]; + const int rhs_dim1 = rhs_shape[1]; + const int rhs_dim2 = rhs_shape[2]; + auto reshaped_rhs = createReshapeOp(rhs, {rhs_dim0 * rhs_dim1, rhs_dim2}, + rhs_element_type, loc, &rewriter); + auto bmm_op = rewriter.create( + loc, ArrayRef{output_type}, reshaped_lhs, reshaped_rhs, + rewriter.getBoolAttr(false), rewriter.getBoolAttr(false)); + rewriter.replaceOp(op, {bmm_op.getResult()}); + return matchSuccess(); + } + if (einsum_eqn == EinsumEquation::FourDBatchMatMul) { + // Case "BFNH,BTNH->BNFT" + // Transpose LHS + lhs = createTransposeOp(lhs, loc, {0, 2, 1, 3}, &rewriter); + // Transpose RHS + rhs = 
createTransposeOp(rhs, loc, {0, 2, 3, 1}, &rewriter); + auto bmm_op = rewriter.create( + loc, ArrayRef{output_type}, lhs, rhs, rewriter.getBoolAttr(false), + rewriter.getBoolAttr(false)); + rewriter.replaceOp(op, {bmm_op.getResult()}); + return matchSuccess(); + } + return matchFailure(); +} + +// Transform Einsum to other TF Ops for the supported variants. +struct TransformEinsumPass : public FunctionPass { + void runOnFunction() override; +}; + +void TransformEinsumPass::runOnFunction() { + OwningRewritePatternList patterns; + auto func = getFunction(); + + patterns.insert(&getContext()); + applyPatternsGreedily(func, patterns); +} + +static PassRegistration pass( + "tf-einsum", "Transform Einsum to other TF Ops for the supported variants"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.h b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.h new file mode 100644 index 00000000000..77b0c72aaef --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This pass identifies patterns for certain Einsum Ops and replaces them +// with other equivalent TF Ops. + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_EINSUM_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_EINSUM_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/MLIRContext.h" // TF:llvm-project +#include "mlir/IR/Matchers.h" // TF:llvm-project +#include "mlir/IR/PatternMatch.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/IR/TypeUtilities.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/matmul_bcast.h" + +namespace mlir { +namespace TF { + +// TF.Einsum provides fully general tensor contractions. For a few select +// cases, we can convert this op to other TF Ops, which in later passes +// properly convert to TF Lite ops. 
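+// The equations currently handled by this pattern (summarized from the pass +// implementation above) are: "IJK,IKM->IJM" -> tf.BatchMatMulV2; +// "BFD,DNH->BFNH" -> reshape RHS + tf.BatchMatMulV2 + reshape; +// "BFND,NDH->BFH" -> reshape both operands + tf.BatchMatMulV2; and +// "BFNH,BTNH->BNFT" -> transpose both operands + tf.BatchMatMulV2.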
+struct ConvertTFEinsumOp : public OpRewritePattern { + public: + explicit ConvertTFEinsumOp(MLIRContext* context) + : OpRewritePattern(context) {} + + PatternMatchResult matchAndRewrite(TF::EinsumOp op, + PatternRewriter& rewriter) const override; +}; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_EINSUM_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc new file mode 100644 index 00000000000..82c198ac82f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc @@ -0,0 +1,120 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/IR/UseDefLists.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/LLVM.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" + +namespace mlir { +namespace tf_saved_model { +namespace { + +// This pass will replace a func's bound inputs (global tensors read through +// tf.ReadVariableOp ops) with tf.Const ops inside the func's body. +// If this pass runs successfully, the resultant IR will be guaranteed to: +// +// 1. Not contain any tf_saved_model.global_tensor ops +// 2. Not contain any tf_saved_model.bound_input arg attrs on tf_saved_model +// exported functions +// Otherwise, the pass fails. +// +// The reason this pass has this contract is that, once it succeeds, we know +// the IR is in the correct form for inference backends (like lite) that do not +// support resources/variables. Further, this contract also ensures that this +// pass lowers from saved model to pure TF. Hence it fails if it cannot lower. +struct FreezeGlobalTensorsPass : public ModulePass { + void runOnModule() override; +}; + +void FreezeGlobalTensorsPass::runOnModule() { + auto module = getModule(); + SymbolTable symbol_table(module); + DenseSet frozen_global_tensors; + + for (auto func : module.getOps()) { + SmallVector args_to_erase; + OpBuilder builder(func.getBody()); + + for (int i = 0, e = func.getNumArguments(); i < e; ++i) { + SmallVector read_variable_ops_to_erase; + auto global_tensor = LookupBoundInput(func, i, symbol_table); + + if (!global_tensor) continue; + frozen_global_tensors.insert(global_tensor); + + // This pass assumes that all global tensors are immutable (e.g. by a + // previous optimize global tensors pass). If not, this pass has to fail + // since it cannot perform one of its goals.
+ if (global_tensor.is_mutable()) { + global_tensor.emitError() << "is not immutable"; + return signalPassFailure(); + } + + auto arg = func.getArgument(i); + for (auto user : arg.getUsers()) { + if (auto read_op = llvm::dyn_cast(user)) { + // Collect all read variable ops so that all its uses can be replaced + // with the tf.constant corresponding to the global tensor op. + read_variable_ops_to_erase.push_back(read_op); + } else { + // Current assumption is all users are tf.ReadVariableOp. Need to + // expand this to handle control flow and call ops. + user->emitError() << "could not rewrite use of immutable bound input"; + return signalPassFailure(); + } + } + + // Replace the arg with a tf.Const op in the function body. + auto const_op = builder.create(global_tensor.getLoc(), + global_tensor.value()); + args_to_erase.push_back(i); + for (auto read_op : read_variable_ops_to_erase) { + read_op.getResult().replaceAllUsesWith(const_op.getResult()); + read_op.erase(); + } + } + func.eraseArguments(args_to_erase); + } + // Erase all global tensors that were frozen. + for (auto global_tensor : frozen_global_tensors) { + global_tensor->erase(); + } + + if (!module.getOps().empty()) { + module.emitError() << "could not freeze all global tensors in the module"; + return signalPassFailure(); + } +} + +} // namespace + +// For "opt" to pick up this pass. +static PassRegistration pass( + "tf-saved-model-freeze-global-tensors", + "Freeze tf_saved_model.global_tensor's in func bodies."); + +std::unique_ptr> CreateFreezeGlobalTensorsPass() { + return std::make_unique(); +} + +} // namespace tf_saved_model +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc new file mode 100644 index 00000000000..0a8d261ee39 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/gpu_fusion.cc @@ -0,0 +1,134 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/PatternMatch.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Pass/PassManager.h" // TF:llvm-project +#include "mlir/Pass/PassRegistry.h" // TF:llvm-project +#include "mlir/Transforms/Passes.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +#define DEBUG_TYPE "tf-gpu-op-fusion" + +namespace mlir { +namespace TF { + +namespace { + +// GpuOpFusionPass is a pass performing fusion specific to GPU targets. +// This is an ad-hoc pass for now, but should be integrated with some notion +// of "target" in the MLIR pipeline in the future. 
+class GpuOpFusionPass : public FunctionPass { + public: + void runOnFunction() final; +}; + +// %y:6 = "tf.FusedBatchNormV3"(%x, %scale, %offset, %mean, %variance) +// %0 = "tf.Relu"(%y#0) +// -> +// %y:6 = "tf._FusedBatchNormEx"(%x, %scale, %offset, %mean, %variance) +// +// Or: +// %y:6 = "tf.FusedBatchNormV3"(%x, %scale, %offset, %mean, %variance) +// %0 = "tf.AddV2"(%y#0, %side_input) +// %1 = "tf.Relu"(%0) +// -> +// %y:6 = "tf._FusedBatchNormEx"(%x, %scale, %offset, %mean, %variance, +// %side_input) +// TODO(aminim): we should revisit this as a declarative pattern. +// For the second pattern, there is not good way in the framework to handle the +// commutativity of the AddV2: we want the FusedBatchNormV3 on any side. +// Also we need some native calls to handle the "hasOneUse" aspects and the +// optional extra operands for the AddV2 case. +struct ReluToFusedBatchNorm : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(ReluOp relu_op, + PatternRewriter &rewriter) const override { + Operation *relu_input = relu_op.features().getDefiningOp(); + if (!relu_input) return matchFailure(); + auto batch_norm = dyn_cast_or_null(relu_input); + AddV2Op add_op; + Value side_input; + if (!batch_norm) { + // We don't have a FusedBatchNorm as input to the ReLu, but we can get + // through an AddV2 as well. + add_op = dyn_cast_or_null(relu_input); + if (!add_op) return matchFailure(); + + batch_norm = + dyn_cast_or_null(add_op.x().getDefiningOp()); + if (batch_norm) { + side_input = add_op.y(); + } else { + // Didn't get a FusedBatchNorm on the LHS of the AddV2, try the RHS. + batch_norm = + dyn_cast_or_null(add_op.y().getDefiningOp()); + if (!batch_norm) return matchFailure(); + side_input = add_op.x(); + } + } + assert(batch_norm); + if (batch_norm.is_training()) return matchFailure(); + if (!batch_norm.y().hasOneUse()) return matchFailure(); + + // Build the newly fused operation to replace the batch norm + OperationState state(batch_norm.getLoc(), + FusedBatchNormExOp::getOperationName()); + state.addOperands(batch_norm.getOperands()); + if (side_input) state.operands.push_back(side_input); + state.addTypes(batch_norm.getResultTypes()); + state.addAttributes(batch_norm.getAttrs()); + Operation *op = rewriter.createOperation(state); + rewriter.replaceOp(batch_norm, op->getResults()); + + // Depending on the case, we may fuse the add, the relu, or both. + if (!add_op || add_op.z().hasOneUse()) { + // We fuse the Relu only if the add has a single use, otherwise we only + // fuse the add itself. 
+ op->setAttr("activation_mode", rewriter.getStringAttr("Relu")); + rewriter.replaceOp(relu_op, op->getResult(0)); + } + if (add_op) { + rewriter.replaceOp(add_op, op->getResult(0)); + } + + return matchSuccess(); + } +}; + +void GpuOpFusionPass::runOnFunction() { + FuncOp func = getFunction(); + OwningRewritePatternList patterns; + patterns.insert(&getContext()); + applyPatternsGreedily(func, patterns); +} + +} // namespace + +std::unique_ptr> CreateGpuOpFusionPass() { + return std::make_unique(); +} + +static PassRegistration layout_assignment( + "tf-gpu-op-fusion", "Fusion optimization for GPU targets"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/libtftpu.h b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc similarity index 55% rename from tensorflow/compiler/xla/python/tpu_driver/client/libtftpu.h rename to tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc index 0562afb2141..281a6011af6 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/libtftpu.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc @@ -13,23 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTFTPU_H_ -#define TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTFTPU_H_ +#include "tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h" -#ifdef __cplusplus -extern "C" { -#endif +namespace tensorflow { -typedef struct TfTpuDriver_CompileOp TfTpuDriver_CompileOp; +Status MlirGraphOptimizationPass::Run(const ConfigProto& config_proto, + mlir::ModuleOp module) { + if (!config_proto.experimental().enable_mlir_graph_optimization()) { + VLOG(1) << "Skipping MLIR Graph Optimization Pass" + << ", session flag not enabled"; + return Status::OK(); + } -TfTpuDriver_CompileOp* TfTpuDriver_CompileOpConstructor(void* ctx); + // TODO(ezhulenev): Add something here. -void TfTpuDriver_CompileOpExecute(TfTpuDriver_CompileOp* op, void* ctx); - -void TfTpuDriver_CompileOpFree(TfTpuDriver_CompileOp* op); - -#ifdef __cplusplus + return Status::OK(); } -#endif -#endif // TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_CLIENT_LIBTFTPU_H_ +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h new file mode 100644 index 00000000000..955da470494 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ + +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" + +namespace tensorflow { + +// Bundle generic MLIR graph optimization passes (some derived from TF Grappler +// graph optimizers) into a single MLIR optimization pass. +class MlirGraphOptimizationPass : public MlirOptimizationPass { + public: + llvm::StringRef name() const override { return "graph_optimization"; } + + bool IsEnabled(const ConfigProto& config_proto) const override { + return config_proto.experimental().enable_mlir_graph_optimization(); + } + + Status Run(const ConfigProto& config_proto, mlir::ModuleOp module) override; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_GRAPH_OPTIMIZATION_PASS_H_ diff --git a/tensorflow/lite/python/testdata/test_registerer.i b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass_registration.cc similarity index 55% rename from tensorflow/lite/python/testdata/test_registerer.i rename to tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass_registration.cc index 1cd41c9164d..4681f8a0f33 100644 --- a/tensorflow/lite/python/testdata/test_registerer.i +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass_registration.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -%{ -#include "tensorflow/lite/python/testdata/test_registerer.h" -%} +#include -%include "tensorflow/lite/python/testdata/test_registerer.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.h" + +namespace tensorflow { +namespace { +constexpr int kMlirGraphOptimizationPriority = 0; +} + +static mlir_pass_registration::MlirOptimizationPassRegistration + register_mlir_graph_optimization_pass( + kMlirGraphOptimizationPriority, + std::make_unique()); + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index e6c4024d5ec..7d65d16e42d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -17,12 +17,15 @@ limitations under the License. 
#include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Pass/PassManager.h" // TF:llvm-project #include "mlir/Pass/PassRegistry.h" // TF:llvm-project #include "mlir/Transforms/Passes.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #define DEBUG_TYPE "tf-layout-optimization" @@ -90,22 +93,32 @@ Permutation GetDataFormatPermutation(StringRef from_data_format, void LayoutAssignmentPass::runOnFunction() { FuncOp func = getFunction(); - // TODO(ezhulenev): LayoutSensitiveInterface should select the optimal data - // layout if there is no explicitly forced data format. - if (force_data_format_.empty()) return; + // Get runtime devices information from the closest parent module. + RuntimeDevices devices; + ::tensorflow::GetDevicesFromOp(func.getParentOfType(), &devices); + + // If there is no runtime device information and data format is not explicitly + // forced, there is nothing to do. + if (devices.NumDevices() == 0 && force_data_format_.empty()) return; func.walk([&](LayoutSensitiveInterface layout_sensitive_interface) { + // Get desired op data format. + StringRef target_data_format = force_data_format_; + if (target_data_format.empty()) { + target_data_format = layout_sensitive_interface.GetOptimalLayout(devices); + } + // Skip ops that already use target data format. auto data_format = layout_sensitive_interface.data_format(); - if (data_format == force_data_format_) return; + if (data_format == target_data_format) return; // Transpose arguments into the target data format. Permutation args_permutation = - GetDataFormatPermutation(data_format, force_data_format_); + GetDataFormatPermutation(data_format, target_data_format); // Transpose results back to the original data format. Permutation res_permutation = - GetDataFormatPermutation(force_data_format_, data_format); + GetDataFormatPermutation(target_data_format, data_format); if (args_permutation.empty() || res_permutation.empty()) return; @@ -119,7 +132,7 @@ void LayoutAssignmentPass::runOnFunction() { }; // Change operation data format. - if (failed(layout_sensitive_interface.UpdateDataFormat(force_data_format_))) + if (failed(layout_sensitive_interface.UpdateDataFormat(target_data_format))) return; // Permute arguments into the target data format. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 1aaceb8ecc7..68617e36f0c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include "llvm/ADT/DenseMap.h" -#include "mlir/Analysis/CallInterfaces.h" // TF:llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project @@ -29,6 +28,7 @@ limitations under the License. 
#include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/SymbolTable.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Support/LLVM.h" // TF:llvm-project #include "mlir/Support/LogicalResult.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 3bd7e164ec7..332e181c9ed 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -46,9 +46,15 @@ std::unique_ptr> CreateTFShapeInferencePass(); // Optional pass which will unroll BatchMatMul and use only MatMul std::unique_ptr> CreateUnrollBatchMatMulPassPass(); +// Optional pass which will map TF BatchMatMul to TF Einsum +std::unique_ptr> CreateBatchMatMulToEinsumPass(); + // Optimizes Tensorflow graph. std::unique_ptr> CreateTFOptimizePass(); +// Performs specific fusion for GPU targets. +std::unique_ptr> CreateGpuOpFusionPass(); + struct LayoutOptimizationPipelineOptions : public PassPipelineOptions { Option force_data_format{ @@ -107,6 +113,10 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function); // removed by resource lifting. Requires known maximum sizes of stacks and // known element shapes of push ops. std::unique_ptr> CreateStackOpsDecompositionPass(); + +// Converts tensor list operations into operations on buffers and sizes. Needs +// static shapes and known max element count. +std::unique_ptr> CreateTensorListOpsDecompositionPass(); } // namespace TF namespace TFControlFlow { @@ -244,6 +254,9 @@ namespace tf_saved_model { // Creates a pass that optimizes tf_saved_model.global_tensor ops. std::unique_ptr> CreateOptimizeGlobalTensorsPass(); +// Creates a pass that freezes tf_saved_model.global_tensor ops. +std::unique_ptr> CreateFreezeGlobalTensorsPass(); + // Creates a pass that uses tf_saved_model dialect linkage information // to mark function visibility. That is, exported functions are marked with // public visibility while the other functions are marked with private diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index b7f8ea263b6..32dbb6f5d34 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -54,8 +54,6 @@ namespace mlir { namespace { -constexpr char kDTypeAttr[] = "dtype"; - // This pass lifts resource variable operations outside of device computation. // This is useful because a lot of accelerator devices can not interact with // resource variables directly.. @@ -188,7 +186,7 @@ void ForwardStoreToLoad(Block* block) { } // Moves resource load operations with the provided `move_load` function. This -// assumes load-store forwarding has been performed on this launch_op such that +// assumes load-store forwarding has been performed on this block such that // all loads of same resource are on its initial values. A `skip_load` functions // is used to indicate whether a load should be skipped. 
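The new declarations in passes.h only expose factory functions; callers still assemble them into a pipeline. A sketch of such wiring, assuming the BatchMatMul-to-Einsum rewrite is a function-level pass and the tensor list decomposition runs on the whole module (consistent with its runOnModule() definition later in this patch). The function name and nesting here are illustrative, not TensorFlow's actual bridge pipeline:

```cpp
#include "mlir/IR/Function.h"  // TF:llvm-project
#include "mlir/IR/Module.h"  // TF:llvm-project
#include "mlir/Pass/PassManager.h"  // TF:llvm-project
#include "mlir/Support/LogicalResult.h"  // TF:llvm-project
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

// Run a small example pipeline built from the newly declared passes.
mlir::LogicalResult RunExamplePipeline(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addNestedPass<mlir::FuncOp>(mlir::TF::CreateBatchMatMulToEinsumPass());
  pm.addPass(mlir::TF::CreateTensorListOpsDecompositionPass());
  return pm.run(module);
}
```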
If there are multiple // loads on the same resource, only the first one will be moved, and the later @@ -198,7 +196,7 @@ void HoistResourceLoads( llvm::function_ref move_load) { llvm::SmallDenseMap resource_to_read_ops; - // Only iterate through ops directly in launch_op's body as we can't handle + // Only iterate through ops directly in the body as we can't handle // ops nested deeper in regions. for (Operation& op : llvm::make_early_inc_range(*block)) { auto read_variable_op = dyn_cast(&op); @@ -220,28 +218,25 @@ void HoistResourceLoads( } } -// If there are any stores to resource defined outside of launch_op's body -// region, the stored values must be returned by launch_op and its return op so -// that new values can be used by sunk resource stores. +// If there are any stores to resource defined outside of the block then the +// stored values must be returned so that new values can be used by sunk +// resource stores. // Returns true if any resource variable stored values are appended, otherwise // false. -bool AppendResourceStoreValueToReturn(tf_device::LaunchOp launch_op) { +bool AppendResourceStoreValueToReturn(Block* body) { bool has_resource_store = false; - Block* body = &launch_op.GetBody(); auto old_return = body->getTerminator(); llvm::SmallVector new_return_operands(old_return->getOperands()); - // Only iterate through ops directly in launch_op's body as we can't handle - // ops nested deeper in regions. - for (Operation& op : launch_op.GetBody()) { - auto assign_variable_op = dyn_cast(&op); - if (!assign_variable_op) continue; + // Only iterate through ops directly in the body as we can't handle ops nested + // deeper in regions. + for (auto assign_variable_op : body->getOps()) { Value resource = assign_variable_op.resource(); if (!resource) continue; - // Skip resources created inside of launch_op. - if (resource.getParentRegion() == &launch_op.body()) continue; + // Skip resources created inside of the body. + if (resource.getParentRegion() == body->getParent()) continue; // TODO(ycao): Prevent same value from being returned multiple times. // TODO(ycao): Do not return resource store value if it is defined outside @@ -267,8 +262,7 @@ tf_device::LaunchOp SinkResourceStores(tf_device::LaunchOp launch_op, OpBuilder* builder) { // Update ReturnOp inside launch_op's body to output final values of updated // external resources. - bool has_resource_store = AppendResourceStoreValueToReturn(launch_op); - if (!has_resource_store) return launch_op; + if (!AppendResourceStoreValueToReturn(&launch_op.GetBody())) return launch_op; auto new_return_op = launch_op.GetBody().getTerminator(); llvm::SmallVector new_launch_return_types( @@ -352,9 +346,9 @@ LogicalResult HoistResourceOpsFromLaunchOp(tf_device::LaunchOp launch_op) { // Holds information about a function's use of a resource argument. struct ResourceArgUseInfo { - bool used; Type data_type; bool updated; + bool used; }; // Finds the ResourceArgUseInfo for each resource argument. Forwarding to the @@ -501,13 +495,13 @@ void LiftArgRetResourcesForFunction( }); // Record the stores in resource_arg_read. for (auto& op : llvm::make_early_inc_range(func_op.front())) { - if (auto write = llvm::dyn_cast(&op)) { - auto arg = write.resource().dyn_cast(); - if (!arg) continue; - // After ForwardStoreToLoad(), there should be just one store for each - // resource. 
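The filtered loop introduced in AppendResourceStoreValueToReturn above lost its angle-bracketed template argument in this copy of the patch; from the surrounding code it is presumably TF::AssignVariableOp. A self-contained sketch of the presumed pattern (the helper name is made up):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Block.h"  // TF:llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"

// Collect stores to resources defined outside `body`; their stored values are
// the ones that must be returned so new values can reach the sunk stores.
llvm::SmallVector<mlir::TF::AssignVariableOp, 4> ExternalResourceStores(
    mlir::Block* body) {
  llvm::SmallVector<mlir::TF::AssignVariableOp, 4> stores;
  for (auto assign_variable_op : body->getOps<mlir::TF::AssignVariableOp>()) {
    mlir::Value resource = assign_variable_op.resource();
    if (!resource) continue;
    // Skip resources created inside of the body itself.
    if (resource.getParentRegion() == body->getParent()) continue;
    stores.push_back(assign_variable_op);
  }
  return stores;
}
```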
- resource_arg_write[arg] = write; - } + auto write = llvm::dyn_cast(&op); + if (!write) continue; + auto arg = write.resource().dyn_cast(); + if (!arg) continue; + // After ForwardStoreToLoad(), there should be just one store for each + // resource. + resource_arg_write[arg] = write; } // Now change the input types to non-resource and remove the internal loads. auto new_types = llvm::to_vector<8>(func_op.getType().getInputs()); @@ -542,8 +536,8 @@ llvm::SmallVector FilterRange( llvm::SmallVector filtered; for (auto entry : llvm::enumerate(range)) { auto it = resource_arg_uses.find(entry.index()); - if (it != resource_arg_uses.end() && !it->getSecond().used) continue; - filtered.push_back(entry.value()); + if (it == resource_arg_uses.end() || it->getSecond().used) + filtered.push_back(entry.value()); } return filtered; } @@ -882,13 +876,6 @@ LogicalResult HandlePartitionedCallOpCallee( auto module = callee.getParentOfType(); name_base += "_resource_lifted"; auto name = name_base; - { - int64_t counter = 0; - while (module.lookupSymbol(name)) { - auto name = name_base; - name += "_" + std::to_string(counter++); - } - } callee = callee.clone(); callee.setName(name); SymbolTable(module).insert(callee); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index b3474e2faf1..6d2ce76eca8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -31,6 +31,7 @@ limitations under the License. #include "mlir/IR/Diagnostics.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/OperationSupport.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/SymbolTable.h" // TF:llvm-project #include "mlir/IR/Value.h" // TF:llvm-project @@ -184,20 +185,79 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { return false; } +// Gets the subtype's shape and data type for `type`. Templated to support both +// ResourceType and VariantType. +template +std::unique_ptr>> +GetSubtypesHelper(Type type) { + auto type_with_subtypes = + type.cast().getElementType().dyn_cast(); + if (!type_with_subtypes || type_with_subtypes.getSubtypes().empty()) { + return nullptr; + } + auto shapes_and_types = absl::make_unique>>(); + for (auto subtype : type_with_subtypes.getSubtypes()) { + auto shape = GetShapeFromMlirType(subtype); + // handle_shapes_and_types requires all shapes to be known. So if any + // subtype is unknown, clear the vector. + if (!shape) { + shapes_and_types = nullptr; + break; + } + tensorflow::DataType dtype; + auto status = + tensorflow::ConvertToDataType(subtype.getElementType(), &dtype); + assert(status.ok() && "Unknown element type"); + shapes_and_types->emplace_back(*shape, dtype); + } + return shapes_and_types; +} + +// Gets the subtype's shape and data type for `type`. +std::unique_ptr>> +GetSubtypes(Type type) { + auto subclasses = GetSubtypesHelper(type); + if (subclasses) return subclasses; + return GetSubtypesHelper(type); +} + +// Makes result types match the operand types. Returns if anything is changed. 
+bool PassThroughOperandTypes(OperandRange operands, ResultRange results) { + bool changed = false; + for (auto entry : llvm::zip(operands, results)) { + Type operand_type = std::get<0>(entry).getType(); + if (operand_type == std::get<1>(entry).getType()) continue; + std::get<1>(entry).setType(operand_type); + changed = true; + } + return changed; +} + } // namespace bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, int64_t graph_version) { assert(tf_dialect == op->getDialect()); + // The shape function of these ops sometimes does not propagate subtypes + // (handle shapes) for resource and variant types. We use a simple passthrough + // to make sure they are preserved in the output. + if (isa(op) || isa(op) || + isa(op) || isa(op)) { + return PassThroughOperandTypes(op->getOperands(), op->getResults()); + } // If no result for this op needs shape inference, we have a fast-path return. - // But if the type is a resource, we do not skip it because we might not have - // the handle shapes. + // But if the type is a resource/variant, we do not skip it because we might + // not have the handle shapes. if (llvm::all_of(op->getResultTypes(), [](Type type) { auto shape_type = type.dyn_cast(); return !shape_type || (shape_type.hasStaticShape() && - !shape_type.getElementType().isa()); + !shape_type.getElementType().isa() && + !shape_type.getElementType().isa()); })) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for statically shaped op '" << op->getName() << "'.\n";); @@ -282,29 +342,8 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, if (auto shape = GetShapeFromMlirType(operand_type)) { input_shapes[index] = *shape; } - // Collect the handle shapes and types for a resource. - if (auto resource_type = operand_type.cast() - .getElementType() - .dyn_cast()) { - if (resource_type.getSubtypes().empty()) continue; - auto shapes_and_types = absl::make_unique>>(); - for (auto subtype : resource_type.getSubtypes()) { - auto shape = GetShapeFromMlirType(subtype); - // handle_shapes_and_types requires all shapes to be known. So if any - // subtype is unknown, clear the vector. - if (!shape) { - shapes_and_types = nullptr; - break; - } - tensorflow::DataType dtype; - auto status = - tensorflow::ConvertToDataType(subtype.getElementType(), &dtype); - assert(status.ok() && "Unknown element type"); - shapes_and_types->emplace_back(*shape, dtype); - } - handle_shapes_and_types[index] = std::move(shapes_and_types); - } + // Collect the handle shapes and types for a resource/variant. + handle_shapes_and_types[index] = GetSubtypes(operand_type); } // Perform the shape inference using an InferenceContext with the input @@ -346,8 +385,9 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return RankedTensorType::get(shape, element_type); }; auto new_element_type = shaped_type.getElementType(); - // Populate the handle shapes for a resource. - if (auto resource_type = new_element_type.dyn_cast()) { + // Populate the handle shapes for a resource/variant. 
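The isa<> tests in the fast-path condition above were stripped of their template arguments in this copy of the patch; the accompanying comment indicates they check for resource and variant element types. A sketch of the presumed check, factored into a helper (the function name is made up):

```cpp
#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/Operation.h"  // TF:llvm-project
#include "mlir/IR/StandardTypes.h"  // TF:llvm-project
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"

// An op can skip shape inference only if every result already has a static
// shape and its element type is neither a resource nor a variant handle
// (whose subtypes may still need to be refined).
bool CanSkipShapeInference(mlir::Operation* op) {
  return llvm::all_of(op->getResultTypes(), [](mlir::Type type) {
    auto shape_type = type.dyn_cast<mlir::ShapedType>();
    return !shape_type ||
           (shape_type.hasStaticShape() &&
            !shape_type.getElementType().isa<mlir::TF::ResourceType>() &&
            !shape_type.getElementType().isa<mlir::TF::VariantType>());
  });
}
```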
+ if (new_element_type.isa() || + new_element_type.isa()) { auto handle_shapes_types = c.output_handle_shapes_and_types(output); if (handle_shapes_types) { llvm::SmallVector subtypes; @@ -359,7 +399,11 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, assert(status.ok() && "Unknown element type"); subtypes.push_back(get_tensor_type(shape_n_type.shape, element_type)); } - new_element_type = TF::ResourceType::get(subtypes, op->getContext()); + if (new_element_type.isa()) { + new_element_type = TF::ResourceType::get(subtypes, op->getContext()); + } else { + new_element_type = TF::VariantType::get(subtypes, op->getContext()); + } } } auto new_type = get_tensor_type(shape_handle, new_element_type); @@ -452,11 +496,13 @@ LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, return PropagateShapeToFunctions(module, while_op.getOperandTypes(), {while_op.cond(), while_op.body()}, graph_version, max_iteration); - } else if (auto call_op = dyn_cast(op)) { - if (call_op.f().isa()) - return PropagateShapeToFunctions(module, call_op.getOperandTypes(), - {call_op.f().getRootReference()}, - graph_version, max_iteration); + } else if (auto call_op = dyn_cast(op)) { + CallInterfaceCallable callable = call_op.getCallableForCallee(); + if (SymbolRefAttr sym = callable.dyn_cast()) { + return PropagateShapeToFunctions( + module, call_op.getArgOperands().getTypes(), {sym.getRootReference()}, + graph_version, max_iteration); + } } // TODO(ycao): Implement support for Call op, including function reuse. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc index b58a2402f4e..4033d522091 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc @@ -40,6 +40,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" @@ -52,6 +53,8 @@ namespace mlir { namespace { +namespace cutil = TF::collection_ops_util; + // A pass that converts stack operations to tensor operations and read/assign // ops on local variables. A later resource lifting pass can further remove the // local variables. @@ -106,88 +109,14 @@ TF::AssignVariableOp WriteLocalVariable(Value local_var, Value value, ArrayRef{}); } -// Creates an i32 scalar tf.Const. -TF::ConstOp CreateScalarConst(int value, OpBuilder builder, Location loc) { - tensorflow::Tensor scalar_tensor(tensorflow::DT_INT32, {}); - scalar_tensor.scalar()() = value; - return builder.create( - loc, tensorflow::ConvertTensor(scalar_tensor, &builder).ValueOrDie()); -} - -// Creates an i32 vector tf.Const. 
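The same stripping affects the new CallOpInterface branch in PropagateShapeIntoAttachedFunctions above. A self-contained sketch of the presumed resolution step, generalizing from TF::PartitionedCallOp to any op implementing CallOpInterface (the helper name and the visit callback are illustrative):

```cpp
#include "llvm/ADT/STLExtras.h"
#include "mlir/IR/Function.h"  // TF:llvm-project
#include "mlir/IR/Module.h"  // TF:llvm-project
#include "mlir/Interfaces/CallInterfaces.h"  // TF:llvm-project

// Resolve the callee of any call-like op and hand it to `visit`.  Region-based
// (non-symbol) callees are ignored, matching the guard in the patch above.
void VisitSymbolCallee(mlir::Operation* op, mlir::ModuleOp module,
                       llvm::function_ref<void(mlir::FuncOp)> visit) {
  auto call = llvm::dyn_cast<mlir::CallOpInterface>(op);
  if (!call) return;
  mlir::CallInterfaceCallable callable = call.getCallableForCallee();
  auto sym = callable.dyn_cast<mlir::SymbolRefAttr>();
  if (!sym) return;
  if (auto callee = module.lookupSymbol<mlir::FuncOp>(sym.getRootReference()))
    visit(callee);
}
```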
-TF::ConstOp GetR1Const(ArrayRef r1, OpBuilder builder, Location loc) { - tensorflow::Tensor shape_tensor(tensorflow::DT_INT32, - {static_cast(r1.size())}); - for (int i = 0; i < r1.size(); ++i) { - shape_tensor.vec()(i) = r1[i]; - } - return builder.create( - loc, tensorflow::ConvertTensor(shape_tensor, &builder).ValueOrDie()); -} - -// Creates a rank-1 op that represents the offsets of the stack element in the -// stack buffer. -Value GetIndicesForStackElement(Value index, Value stack_value, - OpBuilder builder, Location loc) { - auto stack_type = stack_value.getType().cast(); - if (stack_type.getShape().size() == 1) return index; - llvm::SmallVector zeros(stack_type.getShape().size() - 1, 0); - auto zeros_tensor = GetR1Const(zeros, builder, loc); - return builder.create( - loc, - ArrayRef{RankedTensorType::get( - {static_cast(stack_type.getShape().size())}, - getElementTypeOrSelf(index.getType()))}, - ArrayRef{index, zeros_tensor, CreateScalarConst(0, builder, loc)}, - ArrayRef{}); -} - -// Returns the type of the local variable for the stack size. It is a -// tensor<1xi32>, and we use R1 instead of a scalar because it is easier to -// concat it with other offsets. +// Returns the type of the local variable for the stack size. Type GetSizeVarType(OpBuilder builder) { - auto size_type = RankedTensorType::get({1}, builder.getIntegerType(32)); + auto size_type = cutil::GetSizeType(builder); return RankedTensorType::get( {}, TF::ResourceType::get(ArrayRef{size_type}, builder.getContext())); } -// Creates the buffer and size local variables for a stack. -std::pair CreateVariablesForStack(TensorType stack_tensor_type, - TF::StackV2Op stack) { - OpBuilder builder(stack); - auto size_var_type = GetSizeVarType(builder); - auto var_type = RankedTensorType::get( - {}, TF::ResourceType::get(ArrayRef{stack_tensor_type}, - stack.getContext())); - auto local_var = builder.create( - stack.getLoc(), ArrayRef{var_type}, ArrayRef{}, - ArrayRef{}); - auto local_size_var = builder.create( - stack.getLoc(), ArrayRef{size_var_type}, ArrayRef{}, - ArrayRef{}); - - // Zero-initialize the local vars. - WriteLocalVariable(local_size_var, GetR1Const({0LL}, builder, stack.getLoc()), - builder, stack.getLoc()); - auto zero = CreateScalarConst(0, builder, stack.getLoc()).output(); - if (getElementTypeOrSelf(zero.getType()) != - stack_tensor_type.getElementType()) { - zero = builder.create( - stack.getLoc(), - ArrayRef{ - RankedTensorType::get({}, stack_tensor_type.getElementType())}, - ArrayRef{zero}, ArrayRef{}); - } - auto broadcast = builder.create( - stack.getLoc(), ArrayRef{stack_tensor_type}, - ArrayRef{zero, GetR1Const(stack_tensor_type.getShape(), builder, - stack.getLoc())}, - ArrayRef{}); - WriteLocalVariable(local_var, broadcast, builder, stack.getLoc()); - return {local_var, local_size_var}; -} - // Tries to infer the stack element type with full shape based on its uses. llvm::Optional GetStackElementType(Value stack, ModuleOp module) { @@ -449,7 +378,7 @@ LogicalResult HandlePartitionedCallOp( if (arg_it == info.stack_var_arg_to_size_arg.end()) continue; auto it = data_var_to_size_var.find(call.getOperand(i)); if (it == data_var_to_size_var.end()) { - call.emitOpError("Unknown stack."); + call.emitOpError("unknown stack"); return failure(); } assert(arg_it->second == new_operands.size()); @@ -532,25 +461,32 @@ LogicalResult HandleStackV2Op( // Create a buffer variable and a size variable to replace the stack. 
auto elem_type = GetStackElementType(stack.handle(), module); if (!elem_type.hasValue()) { - return stack.emitOpError("cannot infer element shape of stack."); + return stack.emitOpError("cannot infer element shape of stack"); } - auto size_op = stack.max_size().getDefiningOp(); - if (!size_op || !llvm::isa(size_op)) { - return stack.emitOpError("max size of stack is not a constant."); + OpBuilder builder(stack); + Value buffer; + if (failed(cutil::CreateInitBufferValue( + elem_type->getShape(), stack.max_size(), stack, + elem_type->getElementType(), builder, &buffer))) { + return failure(); } - int64_t max_size = - (*llvm::cast(size_op).value().getValues().begin()) - .getSExtValue(); - llvm::SmallVector stack_shape; - stack_shape.push_back(max_size); - for (int64_t dim : elem_type->getShape()) stack_shape.push_back(dim); - auto stack_tensor_type = - RankedTensorType::get(stack_shape, elem_type->getElementType()); - Value local_var; - Value local_size_var; - std::tie(local_var, local_size_var) = - CreateVariablesForStack(stack_tensor_type, stack); - stack.replaceAllUsesWith(local_var); + auto size_var_type = GetSizeVarType(builder); + auto var_type = RankedTensorType::get( + {}, TF::ResourceType::get( + ArrayRef{buffer.getType().cast()}, + stack.getContext())); + auto local_var = builder.create( + stack.getLoc(), ArrayRef{var_type}, ArrayRef{}, + ArrayRef{}); + auto local_size_var = builder.create( + stack.getLoc(), ArrayRef{size_var_type}, ArrayRef{}, + ArrayRef{}); + // Zero-initialize the local vars. + WriteLocalVariable(local_size_var, + cutil::GetR1Const({0LL}, builder, stack.getLoc()), builder, + stack.getLoc()); + WriteLocalVariable(local_var, buffer, builder, stack.getLoc()); + stack.handle().replaceAllUsesWith(local_var); (*data_var_to_size_var)[local_var] = local_size_var; stack.erase(); return success(); @@ -561,7 +497,7 @@ LogicalResult HandleStackPushV2Op( llvm::SmallDenseMap* data_var_to_size_var) { auto it = data_var_to_size_var->find(push.handle()); if (it == data_var_to_size_var->end()) { - return push.emitOpError("unknown stack."); + return push.emitOpError("unknown stack"); } // Push output simply forward the input element. push.replaceAllUsesWith(push.elem()); @@ -569,31 +505,13 @@ LogicalResult HandleStackPushV2Op( // Read the current buffer and size. auto stack_val = ReadLocalVariable(push.handle(), builder, push.getLoc()); auto index = ReadLocalVariable(it->getSecond(), builder, push.getLoc()); - auto stack_buffer_type = stack_val.getType().cast(); - auto slice_shape = llvm::to_vector<8>(stack_buffer_type.getShape()); - slice_shape[0] = 1; - // Caculate the updated buffer. - auto update_slice = builder.create( - push.getLoc(), - ArrayRef{RankedTensorType::get(slice_shape, - stack_buffer_type.getElementType())}, - ArrayRef{push.elem(), - GetR1Const(slice_shape, builder, push.getLoc())}, - ArrayRef{}); stack_val = - builder - .create( - push.getLoc(), ArrayRef{stack_val.getType()}, - ArrayRef{stack_val, update_slice, - GetIndicesForStackElement( - index, stack_val, builder, push.getLoc())}, - ArrayRef{}) - .output(); + cutil::SetElement(index, stack_val, push.elem(), builder, push.getLoc()); // Assign the new buffer and size. 
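The HandleStackV2Op/HandleStackPushV2Op rewrites above reduce a tf.StackV2 to a zero-initialized buffer variable plus a separate size variable, now created through the shared collection_ops_util helpers. A plain-C++ model of that representation (illustration only, no MLIR), with comments pointing at the ops each step roughly corresponds to:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Model of the decomposed stack: a fixed-capacity, zero-initialized buffer and
// a separate size value, mirroring the buffer/size local variables above.
struct DecomposedStack {
  std::vector<float> buffer;  // ~ the cutil::CreateInitBufferValue buffer
  int64_t size = 0;           // ~ the size local variable

  explicit DecomposedStack(int64_t max_size) : buffer(max_size, 0.0f) {}

  void Push(float element) {
    assert(size < static_cast<int64_t>(buffer.size()));
    buffer[size] = element;  // ~ cutil::SetElement (dynamic update slice)
    ++size;                  // ~ add 1 to the size variable
  }

  float Pop() {
    assert(size > 0);
    --size;                  // ~ subtract 1 from the size variable
    return buffer[size];     // ~ cutil::GetElement (slice)
  }
};
```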
WriteLocalVariable(push.handle(), stack_val, builder, push.getLoc()); index = builder.create( push.getLoc(), ArrayRef{index.getType()}, - ArrayRef{index, GetR1Const({1}, builder, push.getLoc())}, + ArrayRef{index, cutil::GetR1Const({1}, builder, push.getLoc())}, ArrayRef{}); WriteLocalVariable(it->getSecond(), index, builder, push.getLoc()); push.erase(); @@ -605,7 +523,7 @@ LogicalResult HandleStackPopV2Op( llvm::SmallDenseMap* data_var_to_size_var) { auto it = data_var_to_size_var->find(pop.handle()); if (it == data_var_to_size_var->end()) { - return pop.emitOpError("unknown stack."); + return pop.emitOpError("unknown stack"); } OpBuilder builder(pop); // Read the current buffer and size. @@ -613,31 +531,10 @@ LogicalResult HandleStackPopV2Op( auto size = ReadLocalVariable(it->getSecond(), builder, pop.getLoc()); auto new_size = builder.create( pop.getLoc(), ArrayRef{size.getType()}, - ArrayRef{size, GetR1Const({1}, builder, pop.getLoc())}, + ArrayRef{size, cutil::GetR1Const({1}, builder, pop.getLoc())}, ArrayRef{}); - auto stack_val_type = stack_val.getType().cast(); - auto elem_type = RankedTensorType::get(stack_val_type.getShape().drop_front(), - stack_val_type.getElementType()); - // Slice the buffer to get the element. - llvm::SmallVector slice_size; - slice_size.push_back(1); - for (int64_t dim : elem_type.getShape()) slice_size.push_back(dim); - auto size_const = GetR1Const(slice_size, builder, pop.getLoc()); - auto slice_type = - RankedTensorType::get(slice_size, stack_val_type.getElementType()); - auto slice = builder.create( - pop.getLoc(), ArrayRef{slice_type}, - ArrayRef{ - stack_val, - GetIndicesForStackElement(new_size, stack_val, builder, pop.getLoc()), - size_const}, - ArrayRef{}); - auto pop_val = builder.create( - pop.getLoc(), ArrayRef{elem_type}, - ArrayRef{slice, - GetR1Const(elem_type.getShape(), builder, pop.getLoc())}, - ArrayRef{}); - pop.replaceAllUsesWith(pop_val.output()); + auto pop_val = cutil::GetElement(new_size, stack_val, builder, pop.getLoc()); + pop.replaceAllUsesWith(pop_val); // Update the size. WriteLocalVariable(it->getSecond(), new_size, builder, pop.getLoc()); pop.erase(); @@ -688,8 +585,7 @@ LogicalResult DecomposeStackOpsInternal( } else if (auto pcall = llvm::dyn_cast(&op)) { if (!pcall.f().isa()) { return pcall.emitOpError( - "Stack decomposition does not support call with nested " - "references."); + "stack decomposition does not support call with nested references"); } if (failed(HandlePartitionedCallOp( pcall, module.lookupSymbol(pcall.f().getRootReference()), diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc new file mode 100644 index 00000000000..8b1ba7d1d30 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -0,0 +1,695 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/IR/TypeUtilities.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace mlir { + +namespace { + +namespace cutil = TF::collection_ops_util; + +// A pass that rewrites tensor list operations to tensor operations on buffers +// and size values. +// +// This pass requires that the full shape of the tensor list can be inferred: 1) +// the maximum size needs to be a constant and 2) the element shape needs to be +// constant. +// +// A tensor list creation op "tf.EmptyTensorList"/"tf.TensorListReserve" will be +// turned in to a zero-initialized buffer, and the size is initialized to a 0 +// for "tf.EmptyTensorList" or the specified size for "tf.TensorListReserve". +// Each push will be turned into "tf.XlaDynamicUpdateSlice" with the incremented +// size, and each pop will be turned into a "tf.Slice" and a copy of the buffer +// with decremented size. Each SetItem will be turned into a +// "tf.XlaDynamicUpdateSlice" with unchanged size, and each GetItem will be +// turned into a "tf.Slice". +// +// The pass also works across control flow and functional calls. +struct TensorListOpsDecompositionPass + : public ModulePass { + void runOnModule() override; +}; + +// Updates func's type according to its current arguments and return values. +void UpdateFuncType(FuncOp func) { + llvm::SmallVector arg_types; + for (auto arg : func.getArguments()) arg_types.push_back(arg.getType()); + func.setType(FunctionType::get( + arg_types, + llvm::to_vector<8>(func.front().getTerminator()->getOperandTypes()), + func.getContext())); +} + +// Holds the size value of a tensor list and whether the size is statically +// known (fixed). +struct SizeInfo { + Value size; + bool fixed; +}; + +// Modifies a function's signature to rewrite tensor list arguments to buffers +// and sizes. 
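The new pass's header comment above describes the same buffer-plus-size lowering for tensor lists, with one extra piece of state: the SizeInfo.fixed bit that distinguishes fixed-size lists (tf.TensorListReserve) from growable ones (tf.EmptyTensorList). A plain-C++ model of those semantics (illustration only; the float element type and known maximum size are assumptions):

```cpp
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Buffer-plus-size model of a decomposed tensor list, including the `fixed`
// bit tracked in SizeInfo.
struct DecomposedTensorList {
  std::vector<float> buffer;
  int64_t size;
  bool fixed;

  // tf.TensorListReserve: size is known up front and stays fixed.
  static DecomposedTensorList Reserve(int64_t num_elements) {
    return {std::vector<float>(num_elements, 0.0f), num_elements,
            /*fixed=*/true};
  }
  // tf.EmptyTensorList: starts empty and may grow up to max_num_elements.
  static DecomposedTensorList Empty(int64_t max_num_elements) {
    return {std::vector<float>(max_num_elements, 0.0f), 0, /*fixed=*/false};
  }

  void SetItem(int64_t index, float v) { buffer[index] = v; }   // ~ update slice
  float GetItem(int64_t index) const { return buffer[index]; }  // ~ tf.Slice

  void PushBack(float v) {
    if (fixed) throw std::logic_error("cannot push on a fixed-size tensor list");
    assert(size < static_cast<int64_t>(buffer.size()));
    buffer[size++] = v;
  }
};
```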
+void ModifyFunctionSignature( + FuncOp func, Type size_type, + llvm::SmallDenseMap* buffer_to_size, + llvm::function_ref(int64_t)> arg_to_buffer_type, + llvm::function_ref arg_buffer_size_is_fixed) { + auto new_input_types = llvm::to_vector<8>(func.getType().getInputs()); + int64_t original_arg_count = new_input_types.size(); + for (int64_t i = 0; i < original_arg_count; ++i) { + auto buffer_type = arg_to_buffer_type(i); + if (!buffer_type.hasValue()) continue; + func.getArgument(i).setType(*buffer_type); + new_input_types[i] = *buffer_type; + auto size_arg = func.front().addArgument(size_type); + new_input_types.push_back(size_arg.getType()); + if (buffer_to_size) { + (*buffer_to_size)[func.getArgument(i)] = {size_arg, + arg_buffer_size_is_fixed(i)}; + } + } + UpdateFuncType(func); +} + +// Holds information about a decomposed callee function for +// PartitionedCall/StatefulPartitionedCall. +struct PartitionedCallDecompositionInfo { + bool signature_change; + FuncOp decomposed_callee; + llvm::SmallDenseMap buffer_arg_to_size_arg; + // Each element is a tuple of (buffer_return_index, size_return_index, + // fixed_size). + llvm::SmallVector, 8> + buffer_ret_to_size_ret; +}; + +LogicalResult DecomposeTensorListOpsInternal( + Block*, ModuleOp, llvm::SmallDenseMap*, + llvm::SmallDenseMap*); + +// Adds the corresponding sizes of tensor list buffers in func's return values +// to the list of return values. Returns the mapping from the buffer indices to +// the added size indices, which is a list of tuples (buffer_return_index, +// size_return_index, fixed_size). +llvm::SmallVector, 8> +AddTensorListSizesToReturn( + FuncOp func, const llvm::SmallDenseMap& buffer_to_size) { + auto old_return = func.front().getTerminator(); + auto new_returns = llvm::to_vector<8>(old_return->getOperands()); + llvm::SmallVector, 8> + output_buffer_to_size; + for (auto retval : llvm::enumerate(old_return->getOperands())) { + auto it = buffer_to_size.find(retval.value()); + if (it == buffer_to_size.end()) continue; + output_buffer_to_size.emplace_back(retval.index(), new_returns.size(), + it->getSecond().fixed); + new_returns.push_back(it->getSecond().size); + } + OpBuilder(old_return).create(old_return->getLoc(), new_returns); + old_return->erase(); + UpdateFuncType(func); + return output_buffer_to_size; +} + +LogicalResult HandleWhileOp( + TF::WhileOp while_op, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::SmallDenseMap* + decomposed_partitioned_call_callees) { + // Rewrite body. + auto body = module.lookupSymbol(while_op.body()); + llvm::SmallDenseMap body_map; + auto find_arg_tensor_list_type = [&](int64_t index) -> llvm::Optional { + auto it = buffer_to_size->find(while_op.getOperand(index)); + if (it == buffer_to_size->end()) return llvm::None; + return it->getFirst().getType(); + }; + auto arg_buffer_size_is_fixed = [&](int64_t index) { + return (*buffer_to_size)[while_op.getOperand(index)].fixed; + }; + OpBuilder builder(while_op); + ModifyFunctionSignature(body, cutil::GetSizeType(builder), &body_map, + find_arg_tensor_list_type, arg_buffer_size_is_fixed); + if (failed(DecomposeTensorListOpsInternal( + &body.front(), module, &body_map, + decomposed_partitioned_call_callees))) { + return failure(); + } + auto output_buffer_to_size = AddTensorListSizesToReturn(body, body_map); + + // Rewrite cond. 
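HandleWhileOp above (and HandleIfOp/HandlePartitionedCallOp after it) rely on the (buffer_return_index, size_return_index, fixed) tuples produced by AddTensorListSizesToReturn to remap results of the rebuilt op. A stripped-down model of that bookkeeping in plain C++ (the types and names are stand-ins):

```cpp
#include <cstdint>
#include <tuple>
#include <vector>

// Stand-in for a function return value: whether it is a tensor-list buffer
// and, if so, whether its size is fixed.
struct ReturnValue {
  bool is_buffer;
  bool fixed_size;
};

// Append one size value per returned buffer and record
// (buffer_return_index, size_return_index, fixed) so callers can remap results.
std::vector<std::tuple<int64_t, int64_t, bool>> AppendSizesToReturn(
    std::vector<ReturnValue>* returns) {
  std::vector<std::tuple<int64_t, int64_t, bool>> mapping;
  const int64_t original_count = returns->size();
  for (int64_t i = 0; i < original_count; ++i) {
    if (!(*returns)[i].is_buffer) continue;
    mapping.emplace_back(i, static_cast<int64_t>(returns->size()),
                         (*returns)[i].fixed_size);
    // The appended entry models the size value returned alongside the buffer.
    returns->push_back({/*is_buffer=*/false, /*fixed_size=*/false});
  }
  return mapping;
}
```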
+ auto cond = module.lookupSymbol(while_op.cond()); + llvm::SmallDenseMap cond_map; + ModifyFunctionSignature(cond, cutil::GetSizeType(builder), &cond_map, + find_arg_tensor_list_type, arg_buffer_size_is_fixed); + if (failed(DecomposeTensorListOpsInternal( + &cond.front(), module, &cond_map, + decomposed_partitioned_call_callees))) { + return failure(); + } + if (output_buffer_to_size.empty()) { + return success(); + } + // Create the new while op. + auto new_while_operands = llvm::to_vector<8>(while_op.getOperands()); + auto new_output_shapes = + llvm::to_vector<8>(while_op.output_shapes().getValue()); + for (int64_t i = 0; i < while_op.getNumResults(); ++i) { + auto it = buffer_to_size->find(while_op.getOperand(i)); + if (it == buffer_to_size->end()) continue; + new_while_operands.push_back(it->getSecond().size); + if (!new_output_shapes.empty()) { + // Size is a scalar shape. + tensorflow::TensorShapeProto shape_proto; + new_output_shapes.push_back(builder.getStringAttr( + tensorflow::mangling_util::MangleShape(shape_proto))); + } + } + auto new_while = + builder.create(while_op.getLoc(), body.getType().getInputs(), + new_while_operands, while_op.getAttrs()); + new_while.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); + for (const auto& entry : output_buffer_to_size) { + (*buffer_to_size)[new_while.getResult(std::get<0>(entry))] = { + new_while.getResult(std::get<1>(entry)), std::get<2>(entry)}; + } + while_op.replaceAllUsesWith( + new_while.getResults().take_front(while_op.getNumResults())); + while_op.erase(); + return success(); +} + +LogicalResult HandleIfOp( + TF::IfOp if_op, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::SmallDenseMap* + decomposed_partitioned_call_callees) { + // Rewrite the branches. + auto then_branch = module.lookupSymbol(if_op.then_branch()); + auto else_branch = module.lookupSymbol(if_op.else_branch()); + llvm::SmallDenseMap then_map; + llvm::SmallDenseMap else_map; + + auto find_arg_buffer_type = [&](int64_t index) -> llvm::Optional { + auto it = buffer_to_size->find(if_op.getOperand(index + 1)); + if (it == buffer_to_size->end()) return llvm::None; + return it->getFirst().getType(); + }; + auto arg_buffer_size_is_fixed = [&](int64_t index) { + return (*buffer_to_size)[if_op.getOperand(index + 1)].fixed; + }; + OpBuilder builder(if_op); + ModifyFunctionSignature(then_branch, cutil::GetSizeType(builder), &then_map, + find_arg_buffer_type, arg_buffer_size_is_fixed); + ModifyFunctionSignature(else_branch, cutil::GetSizeType(builder), &else_map, + find_arg_buffer_type, arg_buffer_size_is_fixed); + const bool arg_no_changed = then_map.empty(); + if (failed(DecomposeTensorListOpsInternal( + &then_branch.front(), module, &then_map, + decomposed_partitioned_call_callees)) || + failed(DecomposeTensorListOpsInternal( + &else_branch.front(), module, &else_map, + decomposed_partitioned_call_callees))) { + return failure(); + } + auto output_buffer_to_size = + AddTensorListSizesToReturn(then_branch, then_map); + AddTensorListSizesToReturn(else_branch, else_map); + if (output_buffer_to_size.empty() && arg_no_changed) return success(); + // Recreate the If op. 
+ auto new_if_operands = llvm::to_vector<8>(if_op.getOperands()); + auto new_output_shapes = llvm::to_vector<8>(if_op.output_shapes().getValue()); + for (int64_t i = 1; i < if_op.getNumOperands(); ++i) { + auto it = buffer_to_size->find(if_op.getOperand(i)); + if (it == buffer_to_size->end()) continue; + new_if_operands.push_back(it->getSecond().size); + if (!new_output_shapes.empty()) { + // Size is a scalar shape. + tensorflow::TensorShapeProto shape_proto; + new_output_shapes.push_back(builder.getStringAttr( + tensorflow::mangling_util::MangleShape(shape_proto))); + } + } + auto new_if = OpBuilder(if_op).create( + if_op.getLoc(), then_branch.getType().getResults(), new_if_operands, + if_op.getAttrs()); + new_if.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); + for (const auto& entry : output_buffer_to_size) { + (*buffer_to_size)[new_if.getResult(std::get<0>(entry))] = { + new_if.getResult(std::get<1>(entry)), std::get<2>(entry)}; + } + if_op.replaceAllUsesWith( + new_if.getResults().take_front(if_op.getNumResults())); + if_op.erase(); + return success(); +} + +template +LogicalResult HandlePartitionedCallOp( + CallOp call, FuncOp callee, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::SmallDenseMap* + decomposed_partitioned_call_callees) { + auto emplace_res = decomposed_partitioned_call_callees->try_emplace( + callee, PartitionedCallDecompositionInfo()); + auto& info = emplace_res.first->getSecond(); + // Recreates the call op with info. + auto recreate_caller = [&] { + auto new_operands = llvm::to_vector<8>(call.getOperands()); + for (int64_t i = 0; i < call.getNumOperands(); ++i) { + auto arg_it = info.buffer_arg_to_size_arg.find(i); + if (arg_it == info.buffer_arg_to_size_arg.end()) continue; + auto it = buffer_to_size->find(call.getOperand(i)); + if (it == buffer_to_size->end()) { + call.emitOpError("unknown tensor list."); + return failure(); + } + assert(arg_it->second == new_operands.size()); + new_operands.push_back(it->getSecond().size); + } + OpBuilder builder(call); + auto new_call = builder.create( + call.getLoc(), info.decomposed_callee.getType().getResults(), + new_operands, call.getAttrs()); + new_call.setAttr( + "f", builder.getSymbolRefAttr( + const_cast(info.decomposed_callee).getName())); + for (const auto& entry : info.buffer_ret_to_size_ret) { + (*buffer_to_size)[new_call.getResult(std::get<0>(entry))] = { + new_call.getResult(std::get<1>(entry)), std::get<2>(entry)}; + } + call.replaceAllUsesWith( + new_call.getResults().take_front(call.getNumResults())); + call.erase(); + return success(); + }; + if (!emplace_res.second) { + // This callee was handled before. + if (!info.signature_change) return success(); + return recreate_caller(); + } + // Rewrite the callee on a cloned function. 
+ llvm::SmallDenseMap callee_map; + auto callee_clone = callee.clone(); + auto find_arg_buffer_type = [&](int64_t index) -> llvm::Optional { + auto it = buffer_to_size->find(call.getOperand(index)); + if (it == buffer_to_size->end()) return llvm::None; + return it->getFirst().getType(); + }; + auto arg_buffer_size_is_fixed = [&](int64_t index) { + return (*buffer_to_size)[call.getOperand(index)].fixed; + }; + ModifyFunctionSignature(callee_clone, cutil::GetSizeType(OpBuilder(call)), + &callee_map, find_arg_buffer_type, + arg_buffer_size_is_fixed); + const bool args_no_changed = callee.empty(); + if (failed(DecomposeTensorListOpsInternal( + &callee_clone.front(), module, &callee_map, + decomposed_partitioned_call_callees))) { + return failure(); + } + info.buffer_ret_to_size_ret = + AddTensorListSizesToReturn(callee_clone, callee_map); + if (args_no_changed && info.buffer_ret_to_size_ret.empty()) { + // Signature is not modified. We do not need to keep two copies. + info.signature_change = false; + auto name = callee.getName(); + callee.erase(); + callee_clone.setName(name); + SymbolTable(module).insert(callee_clone); + } else { + info.signature_change = true; + info.decomposed_callee = callee_clone; + for (auto& entry : callee_map) { + auto buffer_arg = entry.getFirst().dyn_cast(); + if (!buffer_arg) continue; + info.buffer_arg_to_size_arg[buffer_arg.getArgNumber()] = + entry.getSecond().size.cast().getArgNumber(); + } + + // Add the clone with a new name. + auto name = llvm::join(std::vector{callee.getName().str(), + "tensorlist_decomposed"}, + "_"); + callee_clone.setName(name); + SymbolTable(module).insert(callee_clone); + callee = callee_clone; + } + if (info.signature_change) return recreate_caller(); + return success(); +} + +// Parses an R1 value to `shape` if it is a TF::ConstOp output. Otherwise, +// returns an error. 
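HandlePartitionedCallOp above memoizes per-callee work in decomposed_partitioned_call_callees, so each callee is decomposed once (on a clone renamed with a "_tensorlist_decomposed" suffix) and later call sites only re-point themselves at the recorded result. A stripped-down model of that try_emplace pattern in plain C++ (names and fields are illustrative):

```cpp
#include <map>
#include <string>

// What a call site needs to know about an already-decomposed callee.
struct DecompositionInfo {
  bool signature_change = false;
  std::string decomposed_name;
};

// Decompose `callee_name` at most once; subsequent calls return the cached
// record instead of redoing the rewrite.
const DecompositionInfo& DecomposeCalleeOnce(
    const std::string& callee_name,
    std::map<std::string, DecompositionInfo>* cache) {
  auto emplace_result = cache->try_emplace(callee_name, DecompositionInfo());
  DecompositionInfo& info = emplace_result.first->second;
  if (!emplace_result.second) return info;  // This callee was handled before.
  // First visit: rewrite a clone of the callee and record what changed.
  info.signature_change = true;
  info.decomposed_name = callee_name + "_tensorlist_decomposed";
  return info;
}
```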
+LogicalResult GetConstShapeValue(Value shape_value, + llvm::SmallVector* shape) { + auto shape_op = shape_value.getDefiningOp(); + if (!shape_op) return failure(); + auto shape_const_op = llvm::dyn_cast(shape_op); + if (!shape_const_op) return failure(); + for (auto v : shape_const_op.value().getValues()) { + shape->push_back(v.getSExtValue()); + } + return success(); +} + +LogicalResult HandleEmptyTensorListOp( + TF::EmptyTensorListOp list, + llvm::SmallDenseMap* buffer_to_size) { + Value buffer; + OpBuilder builder(list); + llvm::SmallVector element_shape; + if (failed(GetConstShapeValue(list.element_shape(), &element_shape))) { + return list.emitOpError("unknown tensor list element shape"); + } + if (failed(cutil::CreateInitBufferValue( + element_shape, list.max_num_elements(), list, list.element_dtype(), + builder, &buffer))) { + return failure(); + } + Value size = cutil::GetR1Const({0LL}, builder, list.getLoc()); + list.handle().replaceAllUsesWith(buffer); + (*buffer_to_size)[buffer] = {size, /*fixed=*/false}; + list.erase(); + return success(); +} + +LogicalResult HandleTensorListReserveOp( + TF::TensorListReserveOp list, + llvm::SmallDenseMap* buffer_to_size) { + Value buffer; + OpBuilder builder(list); + llvm::SmallVector element_shape; + if (failed(GetConstShapeValue(list.element_shape(), &element_shape))) { + return list.emitOpError("unknown tensor list element shape"); + } + if (failed(cutil::CreateInitBufferValue(element_shape, list.num_elements(), + list, list.element_dtype(), builder, + &buffer))) { + return failure(); + } + Value size = cutil::ReshapeScalarToSizeType(builder, list.num_elements(), + list.getLoc()); + (*buffer_to_size)[buffer] = {size, /*fixed=*/true}; + list.handle().replaceAllUsesWith(buffer); + list.erase(); + return success(); +} + +LogicalResult HandleTensorListFromTensorOp( + TF::TensorListFromTensorOp list, + llvm::SmallDenseMap* buffer_to_size) { + OpBuilder builder(list); + Value buffer = builder.create( + list.getLoc(), ArrayRef{list.tensor().getType()}, + ArrayRef{list.tensor()}, ArrayRef{}); + auto type = buffer.getType().cast(); + if (!type.hasStaticShape()) { + return list.emitOpError("TensorListFromTensorOp input has unknown shape."); + } + Value size = cutil::GetR1Const({type.getShape()[0]}, builder, list.getLoc()); + (*buffer_to_size)[buffer] = {size, /*fixed=*/true}; + list.output_handle().replaceAllUsesWith(buffer); + list.erase(); + return success(); +} + +LogicalResult HandleTensorListPushBackOp( + TF::TensorListPushBackOp push, + llvm::SmallDenseMap* buffer_to_size) { + auto buffer = push.input_handle(); + auto it = buffer_to_size->find(buffer); + if (it == buffer_to_size->end()) { + return push.emitOpError( + "found tf.TensorListPushBack on unknown TensorList."); + } + if (it->getSecond().fixed) { + return push.emitError("cannot push on a fixed-size tensor list"); + } + auto size = it->getSecond().size; + OpBuilder builder(push); + auto new_buffer = + cutil::SetElement(size, buffer, push.tensor(), builder, push.getLoc()); + auto new_size = builder.create( + push.getLoc(), ArrayRef{size.getType()}, + ArrayRef{size, cutil::GetR1Const({1LL}, builder, push.getLoc())}, + ArrayRef{}); + push.output_handle().replaceAllUsesWith(new_buffer); + (*buffer_to_size)[new_buffer] = {new_size, /*fixed=*/false}; + push.erase(); + return success(); +} + +LogicalResult HandleTensorListPopBackOp( + TF::TensorListPopBackOp pop, + llvm::SmallDenseMap* buffer_to_size) { + auto buffer = pop.input_handle(); + auto it = buffer_to_size->find(buffer); + if (it == 
buffer_to_size->end()) { + pop.emitOpError("found tf.TensorListPopBack on unknown TensorList."); + return failure(); + } + if (it->getSecond().fixed) { + return pop.emitError("cannot pop on a fixed-size tensor list"); + } + auto size = it->getSecond().size; + OpBuilder builder(pop); + auto new_buffer = builder.create( + pop.getLoc(), ArrayRef{buffer.getType()}, ArrayRef{buffer}, + ArrayRef{}); + auto new_size = builder.create( + pop.getLoc(), ArrayRef{size.getType()}, + ArrayRef{size, cutil::GetR1Const({1LL}, builder, pop.getLoc())}, + ArrayRef{}); + auto element = cutil::GetElement(new_size, new_buffer, builder, pop.getLoc()); + pop.output_handle().replaceAllUsesWith(new_buffer); + pop.tensor().replaceAllUsesWith(element); + pop.erase(); + (*buffer_to_size)[new_buffer] = {new_size, /*fixed=*/false}; + return success(); +} + +LogicalResult HandleTensorListGetItemOp( + TF::TensorListGetItemOp get_item, + const llvm::SmallDenseMap& buffer_to_size) { + auto buffer = get_item.input_handle(); + auto it = buffer_to_size.find(buffer); + if (it == buffer_to_size.end()) { + get_item.emitOpError("found tf.TensorListGetItemOp on unknown TensorList."); + return failure(); + } + OpBuilder builder(get_item); + auto index = cutil::ReshapeScalarToSizeType(builder, get_item.index(), + get_item.getLoc()); + auto element = + cutil::GetElement(index, buffer, OpBuilder(get_item), get_item.getLoc()); + get_item.item().replaceAllUsesWith(element); + get_item.erase(); + return success(); +} + +LogicalResult HandleTensorListSetItemOp( + TF::TensorListSetItemOp set_item, + llvm::SmallDenseMap* buffer_to_size) { + auto buffer = set_item.input_handle(); + auto it = buffer_to_size->find(buffer); + if (it == buffer_to_size->end()) { + set_item.emitOpError("found tf.TensorListSetItemOp on unknown TensorList."); + return failure(); + } + OpBuilder builder(set_item); + auto index = cutil::ReshapeScalarToSizeType(builder, set_item.index(), + set_item.getLoc()); + auto new_buffer = cutil::SetElement(index, buffer, set_item.item(), builder, + set_item.getLoc()); + set_item.output_handle().replaceAllUsesWith(new_buffer); + (*buffer_to_size)[new_buffer] = it->getSecond(); + set_item.erase(); + return success(); +} + +LogicalResult HandleTensorListLengthOp( + TF::TensorListLengthOp length, + const llvm::SmallDenseMap& buffer_to_size) { + auto it = buffer_to_size.find(length.input_handle()); + if (it == buffer_to_size.end()) { + length.emitOpError("found tf.TensorListLength on unknown TensorList."); + return failure(); + } + OpBuilder builder(length); + if (it->getSecond().fixed) { + auto dim = cutil::CreateScalarConst( + length.input_handle().getType().cast().getDimSize(0), + builder, length.getLoc()); + length.length().replaceAllUsesWith(dim); + } else { + auto current_size = it->getSecond().size; + // Reshapes the R1 length to a scalar. + auto reshape = builder.create( + length.getLoc(), + ArrayRef{RankedTensorType::get( + {}, getElementTypeOrSelf(current_size.getType()))}, + ArrayRef{current_size, + cutil::GetR1Const({}, builder, length.getLoc())}, + ArrayRef{}); + length.length().replaceAllUsesWith(reshape); + } + length.erase(); + return success(); +} + +LogicalResult DecomposeTensorListOpsInternal( + Block* block, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::SmallDenseMap* + decomposed_partitioned_call_callees) { + for (auto& op : llvm::make_early_inc_range(block->getOperations())) { + // TODO(yuanzx): Add a pass to remove identities in device computation. 
+ if (llvm::isa(&op) || llvm::isa(&op)) { + op.replaceAllUsesWith(op.getOperands()); + op.erase(); + } else if (auto list = llvm::dyn_cast(&op)) { + if (failed(HandleEmptyTensorListOp(list, buffer_to_size))) { + return failure(); + } + } else if (auto list = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListReserveOp(list, buffer_to_size))) { + return failure(); + } + } else if (auto list = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListFromTensorOp(list, buffer_to_size))) { + return failure(); + } + } else if (auto push = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListPushBackOp(push, buffer_to_size))) { + return failure(); + } + } else if (auto pop = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListPopBackOp(pop, buffer_to_size))) { + return failure(); + } + } else if (auto get_item = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListGetItemOp(get_item, *buffer_to_size))) { + return failure(); + } + } else if (auto set_item = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListSetItemOp(set_item, buffer_to_size))) { + return failure(); + } + } else if (auto length = llvm::dyn_cast(&op)) { + if (failed(HandleTensorListLengthOp(length, *buffer_to_size))) { + return failure(); + } + } else if (auto stack = llvm::dyn_cast(&op)) { + stack.tensor().replaceAllUsesWith(stack.input_handle()); + stack.erase(); + } else if (auto addn = llvm::dyn_cast(&op)) { + auto it = buffer_to_size->find(addn.getOperand(0)); + if (it != buffer_to_size->end()) { + addn.sum().setType(addn.getOperand(0).getType()); + (*buffer_to_size)[addn.sum()] = it->getSecond(); + } + } else if (auto zeros = llvm::dyn_cast(&op)) { + if (buffer_to_size->count(zeros.x()) > 0) { + zeros.y().setType(zeros.x().getType()); + (*buffer_to_size)[zeros.y()] = (*buffer_to_size)[zeros.x()]; + } + } else if (auto while_op = llvm::dyn_cast(&op)) { + if (failed(HandleWhileOp(while_op, module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + } else if (auto if_op = llvm::dyn_cast(&op)) { + if (failed(HandleIfOp(if_op, module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + } else if (auto pcall = llvm::dyn_cast(&op)) { + if (!pcall.f().isa()) { + return pcall.emitOpError( + "TensorList decomposition does not support call with nested " + "references."); + } + if (failed(HandlePartitionedCallOp( + pcall, module.lookupSymbol(pcall.f().getRootReference()), + module, buffer_to_size, decomposed_partitioned_call_callees))) { + return failure(); + } + } else if (auto spcall = + llvm::dyn_cast(&op)) { + if (failed(HandlePartitionedCallOp( + spcall, module.lookupSymbol(spcall.f()), module, + buffer_to_size, decomposed_partitioned_call_callees))) { + return failure(); + } + } + } + return success(); +} + +LogicalResult DecomposeTensorListOps(Block* block, ModuleOp module) { + llvm::SmallDenseMap buffer_to_size; + llvm::SmallDenseMap + decomposed_partitioned_call_callees; + return DecomposeTensorListOpsInternal(block, module, &buffer_to_size, + &decomposed_partitioned_call_callees); +} + +void TensorListOpsDecompositionPass::runOnModule() { + auto module = getModule(); + auto main = module.lookupSymbol("main"); + if (!main) return; + if (failed(DecomposeTensorListOps(&main.front(), module))) { + signalPassFailure(); + } +} + +static PassRegistration pass( + "tf-tensor-list-ops-decomposition", + "Decompose tensor list operations into operations on buffers and sizes. 
" + "Needs static shapes."); + +} // namespace + +namespace TF { +std::unique_ptr> CreateTensorListOpsDecompositionPass() { + return std::make_unique(); +} +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index 50059154ed7..7fe65b888d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -244,8 +244,7 @@ void TPUDynamicLayoutPass::runOnFunction() { if (!compile || !compile->getResult(1).hasOneUse()) return; auto compile_launch = llvm::dyn_cast(compile); if (!compile_launch || !compile_launch.WrapsSingleOp() || - compile_launch.GetBody().front().getName().getStringRef() != - "tf._TPUCompileMlir") + !llvm::isa(compile_launch.GetBody().front())) return; executes_and_compiles.emplace_back(execute, compile_launch); }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc index 3b2815ec901..c1419873dba 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_merge_variables_with_execute.cc @@ -41,7 +41,6 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // TF:llvm-project #include "mlir/Transforms/RegionUtils.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -136,6 +135,10 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( // by inter-island dependencies. Operation* first_read = nullptr; Operation& execute = execute_launch.GetBody().front(); + auto parallel_execute = llvm::dyn_cast( + execute_launch.getParentOp()); + Operation* execute_parent = + parallel_execute ? parallel_execute.getOperation() : execute_launch; // Find inputs that are variable reads. for (auto operand : llvm::enumerate(execute.getOpOperands())) { infos.new_operand_values.push_back(operand.value().get()); @@ -144,9 +147,9 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( operand.value().get().getDefiningOp()); if (!read_op) continue; if (check_same_region && - read_op.getParentRegion() != execute_launch.getParentRegion()) { + read_op.getParentRegion() != execute_parent->getParentRegion()) continue; - } + auto resource = read_op.resource(); if (check_device) { @@ -193,9 +196,9 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( // work fine for the reads/assigns created by resource lifting, since they are // placed close to the TPUExecute. 
Operation* last_may_modify_resource_access_before_execute = nullptr; - for (Operation& op : llvm::reverse( - llvm::make_range(std::next(first_read->getIterator()), - execute_launch.getOperation()->getIterator()))) { + for (Operation& op : + llvm::reverse(llvm::make_range(std::next(first_read->getIterator()), + execute_parent->getIterator()))) { if (llvm::dyn_cast(&op)) continue; if (!OpAccessesResource(&op)) continue; last_may_modify_resource_access_before_execute = &op; @@ -232,10 +235,16 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( llvm::SmallPtrSet all_assigns; llvm::SmallVector output_fused(execute_launch.getNumResults(), false); - for (int i = 0; i < execute_launch.getNumResults(); ++i) { + + auto execute_outputs = + parallel_execute + ? parallel_execute.GetRegionOutputs( + execute_launch.getParentRegion()->getRegionNumber()) + : execute_launch.getResults(); + for (auto execute_output : llvm::enumerate(execute_outputs)) { // TODO(lyandy): Handle updates to resource writes by remapping to parent // launch result and checking if launch result is an AssignVariableOp. - auto result = execute_launch.getResult(i); + auto result = execute_output.value(); if (!result.hasOneUse()) continue; auto assign_op = llvm::dyn_cast(*result.user_begin()); if (!assign_op) continue; @@ -250,21 +259,20 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( infos.per_resource_info.shrink_and_clear(); return infos; } - info.execute_output_index = i; + info.execute_output_index = execute_output.index(); info.assign = assign_op; if (!last_assign || last_assign->isBeforeInBlock(assign_op)) { last_assign = assign_op; } all_assigns.insert(assign_op); - output_fused[i] = true; + output_fused[execute_output.index()] = true; } // Check if there are other resource accesses after execute. Operation* first_unknown_resource_access_after_execute = nullptr; if (last_assign) { - for (auto& op : llvm::make_range( - std::next(execute_launch.getOperation()->getIterator()), - last_assign->getIterator())) { + for (auto& op : llvm::make_range(std::next(execute_parent->getIterator()), + last_assign->getIterator())) { if (all_assigns.count(&op) > 0) continue; if (!OpAccessesResource(&op)) continue; first_unknown_resource_access_after_execute = &op; @@ -301,6 +309,115 @@ VariableAccessesForTPUExecute BuildVariableAccessInfo( return infos; } +// Appends result types of tf_device.parallel_execute from `start` index region +// (inclusive) to `end` index region (exclusive) to `output_types` and returns +// the number of types added. +int AppendTypes(llvm::SmallVectorImpl* output_types, + tf_device::ParallelExecuteOp parallel_execute, int start, + int end) { + const int size_before = output_types->size(); + for (int index = start; index < end; ++index) { + Block& block = parallel_execute.GetRegionBlockWithIndex(index); + auto terminator_operand_types = block.getTerminator()->getOperandTypes(); + output_types->append(terminator_operand_types.begin(), + terminator_operand_types.end()); + } + return output_types->size() - size_before; +} + +// Replaces TPUExecute with TPUExecuteAndUpdateVariables in a +// tf_device.parallel_execute op. 
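In the tpu_merge_variables_with_execute.cc hunk above, the dyn_cast lost its template argument in this copy; from the later GetRegionOutputs call it is presumably tf_device::ParallelExecuteOp. The intent, as a small sketch (the helper name is made up): when the launch wrapping TPUExecute sits inside a tf_device.parallel_execute, same-region and before/after checks anchor on the parallel_execute op rather than on the launch itself.

```cpp
#include "llvm/Support/Casting.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"

// Pick the operation used for same-region and ordering checks: the enclosing
// tf_device.parallel_execute if present, otherwise the launch itself.
mlir::Operation* GetExecuteAnchor(mlir::tf_device::LaunchOp execute_launch) {
  if (auto parallel_execute =
          llvm::dyn_cast<mlir::tf_device::ParallelExecuteOp>(
              execute_launch.getParentOp()))
    return parallel_execute.getOperation();
  return execute_launch.getOperation();
}
```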
+void ReplaceParallelExecute(tf_device::ParallelExecuteOp parallel_execute, + tf_device::LaunchOp execute_launch, + tf_device::LaunchOp merged_execute_launch, + const VariableAccessesForTPUExecute& infos, + OpBuilder* builder) { + Operation* parallel_execute_op = parallel_execute.getOperation(); + + // Collect result types of tf_device.parallel_execute and update region + // result types with the new merged execute result types. + llvm::SmallVector output_types; + const int parallel_execute_num_results = parallel_execute_op->getNumResults(); + output_types.reserve(parallel_execute_num_results); + Region* execute_region = merged_execute_launch.getParentRegion(); + const int region_index = execute_region->getRegionNumber(); + const int num_results_before_region = + AppendTypes(&output_types, parallel_execute, 0, region_index); + // Append updated results from merged execute. + output_types.append(merged_execute_launch.getResultTypes().begin(), + merged_execute_launch.getResultTypes().end()); + const int num_regions = parallel_execute_op->getNumRegions(); + const int num_results_after_region = AppendTypes( + &output_types, parallel_execute, region_index + 1, num_regions); + + builder->setInsertionPoint(parallel_execute); + auto new_parallel_execute = builder->create( + parallel_execute.getLoc(), num_regions, output_types); + + // Replace the uses of the original parallel_execute before region containing + // merged execute. + Operation* new_parallel_execute_op = new_parallel_execute.getOperation(); + for (int i = 0; i < num_results_before_region; ++i) + parallel_execute_op->getResult(i).replaceAllUsesWith( + new_parallel_execute_op->getResult(i)); + + // Replace the uses of the original parallel_execute after region containing + // merged execute. The number of results changed in the region containing the + // merged execute, but they should match, so results are replaced starting + // from the ends of both parallel_execute. + const int new_parallel_execute_num_results = + new_parallel_execute_op->getNumResults(); + for (int i = 0; i < num_results_after_region; ++i) + parallel_execute_op->getResult(parallel_execute_num_results - i - 1) + .replaceAllUsesWith(new_parallel_execute_op->getResult( + new_parallel_execute_num_results - i - 1)); + + // Replace the uses of the original parallel_execute for the region containing + // the merged execute. + auto old_region_results = parallel_execute.GetRegionOutputs(region_index); + for (int i = 0; i < infos.old_to_new_output_mapping.size(); ++i) { + if (infos.old_to_new_output_mapping[i] < 0) continue; + old_region_results[i].replaceAllUsesWith(new_parallel_execute_op->getResult( + infos.old_to_new_output_mapping[i] + num_results_before_region)); + } + + // Replace original terminator with new terminator for returning merged + // execute results. + Operation* old_terminator = execute_region->front().getTerminator(); + builder->setInsertionPointToEnd(&execute_region->front()); + builder->create(old_terminator->getLoc(), + merged_execute_launch.getResults()); + old_terminator->erase(); + + // Remove the original TPUExecute op. + execute_launch.erase(); + + // Move all regions from old parallel_execute to new parallel_execute. + for (auto region : llvm::zip(new_parallel_execute_op->getRegions(), + parallel_execute_op->getRegions())) + std::get<0>(region).takeBody(std::get<1>(region)); + + // Remove the original parallel_execute. + parallel_execute_op->dropAllUses(); + parallel_execute.erase(); +} + +// Replaces TPUExecute with TPUExecuteAndUpdateVariables. 
+void ReplaceExecute(tf_device::LaunchOp execute_launch, + tf_device::LaunchOp merged_execute_launch, + const VariableAccessesForTPUExecute& infos) { + // Replace the uses. + for (int i = 0; i < infos.old_to_new_output_mapping.size(); ++i) { + if (infos.old_to_new_output_mapping[i] < 0) continue; + execute_launch.getResult(i).replaceAllUsesWith( + merged_execute_launch.getResult(infos.old_to_new_output_mapping[i])); + } + + // Remove the original TPUExecute op. + execute_launch.getOperation()->dropAllUses(); + execute_launch.erase(); +} + // Merges the variable accesses into one TPUExecute op. void MergeForOneTPUExecute(tf_device::LaunchOp execute_launch, bool check_device, bool check_same_region, @@ -352,19 +469,19 @@ void MergeForOneTPUExecute(tf_device::LaunchOp execute_launch, merged_execute.getOperation()->moveBefore( merged_execute_launch.GetBody().getTerminator()); - // Replace the uses. - for (int i = 0; i < infos.old_to_new_output_mapping.size(); ++i) { - if (infos.old_to_new_output_mapping[i] < 0) continue; - execute_launch.getResult(i).replaceAllUsesWith( - merged_execute_launch.getResult(infos.old_to_new_output_mapping[i])); - } + if (auto parallel_execute = llvm::dyn_cast( + execute_launch.getParentOp())) + ReplaceParallelExecute(parallel_execute, execute_launch, + merged_execute_launch, infos, builder); + else + ReplaceExecute(execute_launch, merged_execute_launch, infos); + // Remove the assign ops. for (const auto& entry : infos.per_resource_info) { const auto& info = entry.getSecond(); if (info.assign) info.assign->erase(); } - // Remove the original TPUExecute op. - execute_launch.erase(); + // Remove the read ops if they have no more uses. for (const auto& entry : infos.per_resource_info) { const auto& info = entry.getSecond(); @@ -372,25 +489,43 @@ void MergeForOneTPUExecute(tf_device::LaunchOp execute_launch, } } +// Checks if an op's parent is a tf_device.parallel_execute and the region the +// op is in is perfectly wrapped. +bool ParentParallelExecuteWrapsSingleOp(Operation* op) { + auto parallel_execute = + llvm::dyn_cast(op->getParentOp()); + if (!parallel_execute) return true; + + return parallel_execute.RegionWrapsSingleOp( + op->getParentRegion()->getRegionNumber()); +} + void TPUMergeVariablesWithExecutePass::runOnFunction() { // Find all the executes first, since we will mutate the nodes around each // execute. llvm::SmallVector execute_launches; getFunction().walk([&](tf_device::LaunchOp op) { - if (op.WrapsSingleOp() && llvm::isa(op.GetBody().front())) + if (op.WrapsSingleOp() && + llvm::isa(op.GetBody().front()) && + ParentParallelExecuteWrapsSingleOp(op)) execute_launches.push_back(op); }); for (auto execute_launch : execute_launches) { OpBuilder builder(&getContext()); const bool parent_is_replicate = - llvm::isa(execute_launch.getParentOp()); + llvm::isa(execute_launch.getParentOp()) || + (llvm::isa( + execute_launch.getParentOp()) && + llvm::isa( + execute_launch.getParentOp()->getParentOp())); + // If this is inside a tf_device::ReplicateOp, the variables are guaranteed // to be on the same device as the TPUExecute op. Skip device checking in // that case, but we need to check that we are only merging reads/assigns // that are also in this replicated region.
- MergeForOneTPUExecute(execute_launch, !parent_is_replicate, - parent_is_replicate, &builder); + MergeForOneTPUExecute(execute_launch, /*check_device=*/!parent_is_replicate, + /*check_same_region=*/parent_is_replicate, &builder); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 7b0291a2f9b..50b6555076d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -66,10 +67,21 @@ constexpr char kNumReplicasAttr[] = "num_replicas"; constexpr char kNumCoresPerReplicaAttr[] = "num_cores_per_replica"; constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; constexpr char kPaddingMapAttr[] = "padding_map"; +constexpr char kTopologyAttr[] = "topology"; +constexpr char kDeviceAssignmentAttr[] = "device_assignment"; constexpr char kDeviceAttr[] = "device"; constexpr char kDevicesAttr[] = "devices"; constexpr char kVersionsAttr[] = "tf.versions"; +constexpr char kBadStringArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not a string"; +constexpr char kBadIntArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not an int"; +constexpr char kBadArrayElementMsg[] = + "bad '{0}' attribute at index {1} with value '{2}': failed to parse to {3}"; +constexpr char kBadArrayAttrLengthMsg[] = + "bad '{0}' attribute, expected array attribute of size {1}, got size {2}"; + // Rewrites `tf_device.launch_func` operations assigned to TPU into actual TPU // jit-compile runtime ops. // @@ -150,17 +162,37 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, return success(); } -// Populates a TPUCompileMetadataProto from attributes of a -// `tf_device::LaunchFuncOp`. If any necessary attributes are missing from the -// op, a failure will be returned. -// TODO(lyandy): Support session handle and guaranteed consts. -LogicalResult SetMetadataProtoFromLaunchFuncOp( - tf_device::LaunchFuncOp op, int num_replicas, int num_cores_per_replica, - llvm::Optional&& xla_device_assignment, - tensorflow::tpu::TPUCompileMetadataProto* metadata) { - metadata->set_num_replicas(num_replicas); - metadata->set_num_cores_per_replica(num_cores_per_replica); +// Extracts device coordinates from a device assignment attribute on an op. 
+LogicalResult GetDeviceCoordinates( + tf_device::LaunchFuncOp op, + llvm::SmallVectorImpl* device_assignment) { + auto device_assignment_attr = + op.getAttrOfType(kDeviceAssignmentAttr); + if (!device_assignment_attr) + return op.emitOpError(CreateMissingAttributeMsg(kDeviceAssignmentAttr)); + device_assignment->reserve(device_assignment_attr.size()); + + for (auto device_coordinate_and_idx : + llvm::enumerate(device_assignment_attr)) { + auto device_coordinate = + device_coordinate_and_idx.value().dyn_cast(); + if (!device_coordinate) + return op.emitOpError(llvm::formatv(kBadIntArrayElementMsg, + kDeviceAssignmentAttr, + device_coordinate_and_idx.index())); + + device_assignment->push_back(device_coordinate.getInt()); + } + + return success(); +} + +// Populates a TPUCompileMetadataProto with StepMarkerLocation from a +// `tf_device::LaunchFuncOp`. +LogicalResult SetMetadataProtoStepMarkerLocation( + tf_device::LaunchFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { auto step_marker_location = op.getAttrOfType(kStepMarkerLocationAttr); if (!step_marker_location) @@ -179,6 +211,14 @@ LogicalResult SetMetadataProtoFromLaunchFuncOp( metadata->set_step_marker_location(location); + return success(); +} + +// Populates a TPUCompileMetadataProto with PaddingMap from a +// `tf_device::LaunchFuncOp`. +LogicalResult SetMetadataProtoPaddingMap( + tf_device::LaunchFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { auto padding_map = op.getAttrOfType(kPaddingMapAttr); if (!padding_map) return op.emitOpError(CreateMissingAttributeMsg(kPaddingMapAttr)); @@ -187,25 +227,56 @@ LogicalResult SetMetadataProtoFromLaunchFuncOp( auto& padding_attr = padding_and_idx.value(); auto padding_attr_str = padding_attr.dyn_cast(); if (!padding_attr_str) - return op.emitOpError( - llvm::formatv("bad '{0}' attribute at index {1}, not a string", - kPaddingMapAttr, padding_and_idx.index())); + return op.emitOpError(llvm::formatv( + kBadStringArrayElementMsg, kPaddingMapAttr, padding_and_idx.index())); tensorflow::tpu::PaddingMap* padding = metadata->mutable_padding_maps()->Add(); if (!padding->ParseFromString(std::string(padding_attr_str.getValue()))) return op.emitOpError(llvm::formatv( - "bad '{0}' attribute at index {1} with value '{2}'", kPaddingMapAttr, - padding_and_idx.index(), padding_attr_str.getValue())); + kBadArrayElementMsg, kPaddingMapAttr, padding_and_idx.index(), + padding_attr_str.getValue(), "tpu::PaddingMap")); } - if (xla_device_assignment.hasValue()) - *metadata->mutable_device_assignment() = - std::move(xla_device_assignment.getValue()); + return success(); +} + +// Parses a xla::OpSharding from a string attribute. +LogicalResult SetOpSharding(Operation* op, Attribute attr, llvm::StringRef name, + int index, xla::OpSharding* sharding) { + auto sharding_str = attr.dyn_cast(); + if (!sharding_str) + return op->emitOpError( + llvm::formatv(kBadStringArrayElementMsg, name, index)); + + if (!sharding->ParseFromString(sharding_str.getValue().str())) + return op->emitOpError(llvm::formatv(kBadArrayElementMsg, name, index, + sharding_str.getValue(), + "xla::OpSharding")); + + return success(); +} + +// Populates a TPUCompileMetadataProto with argument types and sharding from a +// `tf_device::LaunchFuncOp`. 
+LogicalResult SetMetadataProtoArgs( + tf_device::LaunchFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + auto input_shardings = + op.getAttrOfType(tensorflow::kInputShardingAttr); + if (!input_shardings) + return op.emitOpError( + CreateMissingAttributeMsg(tensorflow::kInputShardingAttr)); + + if (input_shardings.size() != op.getNumOperands()) + return op.emitOpError( + llvm::formatv(kBadArrayAttrLengthMsg, tensorflow::kInputShardingAttr, + op.getNumOperands(), input_shardings.size())); // Set args metadata in proto. for (auto operand_type_and_idx : llvm::enumerate(op.getOperandTypes())) { Type operand_type = operand_type_and_idx.value(); + int index = operand_type_and_idx.index(); tensorflow::tpu::TPUCompileMetadataProto::Arg* arg = metadata->add_args(); tensorflow::DataType dtype; tensorflow::Status status = @@ -213,7 +284,7 @@ LogicalResult SetMetadataProtoFromLaunchFuncOp( if (!status.ok()) return op.emitOpError( llvm::formatv("failed to determine operand type at index {0}: {1}", - operand_type_and_idx.index(), status.error_message())); + index, status.error_message())); arg->set_dtype(dtype); // TODO(lyandy): Support other arg kinds. @@ -232,29 +303,67 @@ LogicalResult SetMetadataProtoFromLaunchFuncOp( arg->mutable_shape()->set_unknown_rank(true); } - // TODO(lyandy): Determine proper sharding of args once topology and devices - // are propagated to the pass. - xla::OpSharding sharding; - sharding.set_type(xla::OpSharding::MAXIMAL); - sharding.add_tile_assignment_dimensions(1); - sharding.add_tile_assignment_devices(0); - *arg->mutable_sharding() = std::move(sharding); - } - - // Set retvals metadata in proto. - // TODO(lyandy): Determine proper sharding of retvals once topology and - // devices is propagated to the pass. - for (int i = 0; i < op.getNumResults(); ++i) { - xla::OpSharding sharding; - sharding.set_type(xla::OpSharding::MAXIMAL); - sharding.add_tile_assignment_dimensions(1); - sharding.add_tile_assignment_devices(0); - *metadata->add_retvals()->mutable_sharding() = std::move(sharding); + if (failed(SetOpSharding(op, input_shardings.getValue()[index], + tensorflow::kInputShardingAttr, index, + arg->mutable_sharding()))) + return failure(); } return success(); } +// Populates a TPUCompileMetadataProto with result sharding from a +// `tf_device::LaunchFuncOp`. +LogicalResult SetMetadataProtoRetvals( + tf_device::LaunchFuncOp op, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + auto output_shardings = + op.getAttrOfType(tensorflow::kOutputShardingAttr); + if (!output_shardings) + return op.emitOpError( + CreateMissingAttributeMsg(tensorflow::kOutputShardingAttr)); + + if (output_shardings.size() != op.getNumResults()) + return op.emitOpError( + llvm::formatv(kBadArrayAttrLengthMsg, tensorflow::kOutputShardingAttr, + op.getNumResults(), output_shardings.size())); + + // Set retvals metadata in proto. + for (auto output_sharding_and_idx : llvm::enumerate(output_shardings)) + if (failed(SetOpSharding(op, output_sharding_and_idx.value(), + tensorflow::kOutputShardingAttr, + output_sharding_and_idx.index(), + metadata->add_retvals()->mutable_sharding()))) + return failure(); + + return success(); +} + +// Populates a TPUCompileMetadataProto from attributes of a +// `tf_device::LaunchFuncOp`. If any necessary attributes are missing from the +// op, a failure will be returned. +// TODO(lyandy): Support session handle and guaranteed consts. 
+LogicalResult SetMetadataProtoFromLaunchFuncOp( + tf_device::LaunchFuncOp op, int num_replicas, int num_cores_per_replica, + llvm::Optional&& xla_device_assignment, + tensorflow::tpu::TPUCompileMetadataProto* metadata) { + metadata->set_num_replicas(num_replicas); + metadata->set_num_cores_per_replica(num_cores_per_replica); + + if (failed(SetMetadataProtoStepMarkerLocation(op, metadata))) + return failure(); + + if (failed(SetMetadataProtoPaddingMap(op, metadata))) return failure(); + + if (xla_device_assignment.hasValue()) + *metadata->mutable_device_assignment() = + std::move(xla_device_assignment.getValue()); + + if (failed(SetMetadataProtoArgs(op, metadata))) return failure(); + + return SetMetadataProtoRetvals(op, metadata); +} + // Wraps single op in `tf_device.launch` for explicit device assignment. tf_device::LaunchOp WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op, llvm::StringRef device) { @@ -282,9 +391,6 @@ Operation* BuildCompileOp( int num_cores_per_replica, llvm::StringRef compilation_device, llvm::Optional&& xla_device_assignment, OpBuilder* builder) { - // TODO(b/139377366): Use tf_tpu.compile build method when it is defined. - OperationState compile_op_state(launch_func.getLoc(), "tf._TPUCompileMlir"); - // Set metadata from attributes. tensorflow::tpu::TPUCompileMetadataProto metadata; if (failed(SetMetadataProtoFromLaunchFuncOp( @@ -298,9 +404,6 @@ Operation* BuildCompileOp( else metadata.SerializeToString(&txt_metadata); - compile_op_state.addAttribute("metadata", - builder->getStringAttr(txt_metadata)); - // Build a shape op for each input to launch_func. // TODO(b/139377366): When shape inference is ready, we can use compile time // shape inference to get inputs that have static shapes and only use shape @@ -320,63 +423,77 @@ Operation* BuildCompileOp( operand_and_idx.value()); compile_op_operands.emplace_back(shape_op.getResult()); } - compile_op_state.addOperands(compile_op_operands); - compile_op_state.addAttribute( - "NumDynamicShapes", - builder->getI64IntegerAttr(compile_op_operands.size())); - FlatSymbolRefAttr func_attr = - launch_func.getAttrOfType("func"); - if (!func_attr) { - launch_func.emitOpError("does not have `func` attribute"); - return nullptr; - } + FlatSymbolRefAttr func_attr = launch_func.funcAttr(); FuncOp func = launch_func.getParentOfType().lookupSymbol( func_attr.getValue()); std::string txt_module; if (failed(EncapsulateFuncAndSerialize(func, &txt_module))) return nullptr; - compile_op_state.addAttribute("mlir_module", - builder->getStringAttr(txt_module)); - // Result #0 is a string indicating whether compilation is successful or not. - compile_op_state.addTypes( - RankedTensorType::get({}, builder->getType())); + auto result_type = + RankedTensorType::get({}, builder->getType()); - // Result #1 is key to look up executable binary in compilation cache. - compile_op_state.addTypes( - RankedTensorType::get({}, builder->getType())); + auto compile_op = builder->create( + launch_func.getLoc(), /*compilation_status=*/result_type, /*program=*/ + llvm::SmallVector(num_cores_per_replica, result_type), + compile_op_operands, txt_module, txt_metadata); - Operation* compile_op = builder->createOperation(compile_op_state); - - return WrapOpInLaunch(builder, compile_op->getLoc(), compile_op, + return WrapOpInLaunch(builder, compile_op.getLoc(), compile_op, compilation_device); } -// Creates a `tf.TPUExecute` op that executes TPU program generated by -// `compile_op`. 
-Operation* BuildExecuteOp(Operation* compile_op, - tf_device::LaunchFuncOp launch_func, - OpBuilder* builder) { - // TPUExecute inherits all launch_func inputs, and takes an additional input - // for compilation cache key. - llvm::SmallVector tensor_inputs(launch_func.getOperands()); - tensor_inputs.push_back(compile_op->getResult(1)); +// Assigns explicit devices to replicate op. An aliased device is created per +// core, and all replica devices per core are grouped together. +void AssignDevicesToReplicate( + tf_device::ReplicateOp replicate, + llvm::ArrayRef> execution_devices, + OpBuilder* builder) { + if (!replicate) return; + const int num_replicas = execution_devices.size(); + const int num_cores_per_replica = execution_devices.front().size(); + + llvm::SmallVector device_attrs; + for (int core = 0; core < num_cores_per_replica; ++core) { + llvm::SmallVector devices_by_core; + devices_by_core.reserve(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) + devices_by_core.push_back(execution_devices[replica][core]); + + device_attrs.push_back( + builder->getNamedAttr(tensorflow::GetDeviceAliasForLogicalCore(core), + builder->getStrArrayAttr(devices_by_core))); + } + + replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attrs)); +} + +// Creates a `tf.TPUExecute` op that executes TPU program. +Operation* BuildExecuteOp( + const int core_id, llvm::ArrayRef output_sharding_config, + llvm::ArrayRef inputs, tf_device::LaunchFuncOp launch_func, + OpBuilder* builder) { // TODO(b/139377366): Need to snapshot all resource variable inputs in // follow-up CLs. + auto output_types = tensorflow::GetOutputTypesForLogicalDeviceComputation( + core_id, output_sharding_config, launch_func); + // TPUExecute has same output types as launch_func. - return builder->create( - launch_func.getLoc(), launch_func.getResultTypes(), tensor_inputs, - llvm::ArrayRef{}); + return builder->create(launch_func.getLoc(), output_types, + inputs, + llvm::ArrayRef{}); } // Creates a tf_device.parallel_execute op that wraps TPUExecute op to // represent execution of TPU program in multiple logical cores. -Operation* BuildParallelExecuteOp(int num_logical_cores, Operation* compile_op, - tf_device::LaunchFuncOp launch_func, - OpBuilder* builder) { +tf_device::ParallelExecuteOp BuildParallelExecuteOp( + llvm::ArrayRef> execution_devices, + llvm::ArrayRef output_sharding_config, + Operation* compile_op, tf_device::LaunchFuncOp launch_func, + OpBuilder* builder) { + const int num_cores_per_replica = execution_devices.front().size(); // parallel_execute op returns concatenated list of return values of // all its regions. 
// @@ -385,18 +502,28 @@ Operation* BuildParallelExecuteOp(int num_logical_cores, Operation* compile_op, const auto& launch_result_types = launch_func.getResultTypes(); llvm::SmallVector concatenated_output_types; concatenated_output_types.reserve(launch_result_types.size() * - num_logical_cores); + num_cores_per_replica); - for (int core_id = 0; core_id < num_logical_cores; ++core_id) - for (Type t : launch_result_types) - concatenated_output_types.emplace_back(t); + for (int core = 0; core < num_cores_per_replica; ++core) { + auto output_types = tensorflow::GetOutputTypesForLogicalDeviceComputation( + core, output_sharding_config, launch_func); + for (Type t : output_types) concatenated_output_types.emplace_back(t); + } auto parallel_execute_op = builder->create( - launch_func.getLoc(), num_logical_cores, concatenated_output_types); + launch_func.getLoc(), num_cores_per_replica, concatenated_output_types); + // Extract inputs for each region of the parallel_execute op. The i-th + // element in the list represents the input lists to TPU computation for + // i-th logical core. + auto input_list = tensorflow::ExtractInputsForLogicalDevices( + num_cores_per_replica, launch_func); + + const bool replicated = execution_devices.size() != 1; // For each logical core, create a region with TPUExecute op. - for (int core_id = 0; core_id < num_logical_cores; ++core_id) { - auto& region = parallel_execute_op.GetRegionBlockWithIndex(core_id); + assert(input_list.size() == num_cores_per_replica); + for (int core = 0; core < num_cores_per_replica; ++core) { + auto& region = parallel_execute_op.GetRegionBlockWithIndex(core); builder->setInsertionPointToEnd(®ion); // Create Execute op. @@ -404,14 +531,21 @@ Operation* BuildParallelExecuteOp(int num_logical_cores, Operation* compile_op, // TODO(b/148913294): Identify inputs/return values specific to each // logical core TPU execution by parsing xla_sharding op in // launch_func. - auto execute = BuildExecuteOp(compile_op, launch_func, builder); + auto execute_inputs = input_list[core]; + execute_inputs.emplace_back(compile_op->getResult(core + 1)); - // Create a launch op for each region of parallel_execute. - // - // TODO(b/149102679): Add device attribute to launch op once device - // topology for multiple logical cores can be correctly parsed. - auto region_launch_op = WrapOpInLaunch( - builder, region.getParent()->getLoc(), execute, /*device=*/""); + auto execute = BuildExecuteOp(core, output_sharding_config, execute_inputs, + launch_func, builder); + + // If computation is replicated, use aliased device. Otherwise there is only + // one execution device per core and the device is assigned to the execute + // op. + std::string device = replicated + ? tensorflow::GetDeviceAliasForLogicalCore(core) + : execution_devices.front()[core]; + + auto region_launch_op = + WrapOpInLaunch(builder, region.getParent()->getLoc(), execute, device); builder->create(region.getParent()->getLoc(), region_launch_op.getResults()); @@ -420,43 +554,14 @@ Operation* BuildParallelExecuteOp(int num_logical_cores, Operation* compile_op, return parallel_execute_op; } -// As tf_device.parallel_execute wraps # logical cores number of TPUExecute -// ops, the number of return values of parallel_execute op exceeds that of -// launch_func op. As so, each return value of parallel_execute op must be -// mapped with corresponding return value usages of launch_func. 
-// -// TODO(b/148913294): Once argument and return value sharding of tpu computation -// is determined, correctly map outputs of parallel_execute op. -void RemapOutputsOfParallelExecute(tf_device::LaunchFuncOp launch_func, - Operation* op) { - for (auto outputs : llvm::zip(launch_func.getResults(), op->getResults())) - std::get<0>(outputs).replaceAllUsesWith(std::get<1>(outputs)); -} - tf_device::LaunchOp AssignDevicesToReplicatedExecute( llvm::ArrayRef> execution_devices, - tf_device::ReplicateOp replicate, Operation* execute_op, - OpBuilder* builder) { - // If computation is replicated, execution devices are assigned to the - // replicate. Otherwise there is only one execution device and the device is - // assigned to the execute op. - std::string device; - if (replicate) { - // Model parallelism is not support for now. Therefore, assign all ops - // in replicate op with virtual device alias specifying that ops will be - // executed on the zeroth core. - llvm::SmallVector replicate_execution_devices; - replicate_execution_devices.reserve(execution_devices.size()); - for (const auto& replica_execution_devices : execution_devices) - replicate_execution_devices.push_back(replica_execution_devices.front()); - - device = tensorflow::GetDeviceAliasForLogicalCore(0); - auto device_attr = builder->getNamedAttr( - device, builder->getStrArrayAttr(replicate_execution_devices)); - replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attr)); - } else { - device = execution_devices.front().front(); - } + Operation* execute_op, OpBuilder* builder) { + const bool replicated = execution_devices.size() != 1; + // If computation is replicated, use aliased device. Otherwise there is only + // one execution device and the device is assigned to the execute op. + std::string device = replicated ? tensorflow::GetDeviceAliasForLogicalCore(0) + : execution_devices.front().front(); return WrapOpInLaunch(builder, execute_op->getLoc(), execute_op, device); } @@ -466,10 +571,8 @@ tf_device::LaunchOp AssignDevicesToReplicatedExecute( void BuildTPUCompileSucceededAssertOp(Operation* compile_op, llvm::StringRef compilation_device, OpBuilder* builder) { - OperationState assert_op_state(compile_op->getLoc(), - "tf.TPUCompileSucceededAssert"); - assert_op_state.addOperands(compile_op->getResult(0)); - Operation* assert_op = builder->createOperation(assert_op_state); + auto assert_op = builder->create( + compile_op->getLoc(), compile_op->getResult(0)); WrapOpInLaunch(builder, compile_op->getLoc(), assert_op, compilation_device); } @@ -551,11 +654,19 @@ LogicalResult Rewrite( int num_cores_per_replica = num_cores_per_replica_attr.getInt(); + auto topology_attr = launch_func.getAttrOfType(kTopologyAttr); + if (!topology_attr) + return launch_func.emitOpError(CreateMissingAttributeMsg(kTopologyAttr)); + + llvm::SmallVector device_assignment; + if (failed(GetDeviceCoordinates(launch_func, &device_assignment))) + return failure(); + // Determine compilation and execution devices. 
auto status_or_tpu_device_assignment = tensorflow::GetTPUCompilationAndExecutionDevices( - devices, num_replicas, num_cores_per_replica, /*topology_attr=*/"", - /*device_assignment_attr=*/{}); + devices, num_replicas, num_cores_per_replica, + topology_attr.getValue(), device_assignment); if (!status_or_tpu_device_assignment.ok()) return launch_func.emitError() << "error in fetching TPU compilation/execution devices: " @@ -581,21 +692,35 @@ LogicalResult Rewrite( BuildTPUCompileSucceededAssertOp( compile_op, tpu_device_assignment.compilation_device, builder); + AssignDevicesToReplicate(replicate, tpu_device_assignment.execution_devices, + builder); + + llvm::SmallVector output_shardings; + auto result = tensorflow::ParseAndValidateOutputSharding(launch_func, + &output_shardings); + if (failed(result)) return failure(); + if (num_cores_per_replica > 1) { // For model parallelism, tf_device.parallel_execute is used to express // concurrent device execution across multiple logical devices. - Operation* execute_op = BuildParallelExecuteOp( - num_cores_per_replica, compile_op, launch_func, builder); + tf_device::ParallelExecuteOp execute_op = BuildParallelExecuteOp( + tpu_device_assignment.execution_devices, output_shardings, compile_op, + launch_func, builder); - RemapOutputsOfParallelExecute(launch_func, execute_op); - - // TODO(hongjunchoi): Correctly parse TPU topology and assign logical device - // attributes to launch_op's within parallel_execute op. + // As tf_device.parallel_execute wraps # logical cores number of TPUExecute + // ops, the number of return values of parallel_execute op exceeds that of + // launch_func op. As such, each return value of parallel_execute op must be + // mapped with corresponding return value usages of launch_func. + tensorflow::RemapOutputsFromLogicalDevices(output_shardings, launch_func, + execute_op); } else { - Operation* execute_op = BuildExecuteOp(compile_op, launch_func, builder); + llvm::SmallVector execute_inputs(launch_func.getOperands()); + execute_inputs.emplace_back(compile_op->getResult(1)); + + Operation* execute_op = BuildExecuteOp( + /*core_id=*/0, output_shardings, execute_inputs, launch_func, builder); tf_device::LaunchOp launch_op = AssignDevicesToReplicatedExecute( - tpu_device_assignment.execution_devices, replicate, execute_op, - builder); + tpu_device_assignment.execution_devices, execute_op, builder); launch_func.replaceAllUsesWith(launch_op); } @@ -605,13 +730,14 @@ LogicalResult Rewrite( } void TPURewritePass::runOnModule() { - llvm::SmallVector devices; + mlir::TF::RuntimeDevices devices; if (failed(tensorflow::GetDevicesFromOp(getModule(), &devices))) return signalPassFailure(); OpBuilder builder(&getContext()); auto result = getModule().walk([&](tf_device::LaunchFuncOp op) { - if (failed(Rewrite(op, devices, &builder))) return WalkResult::interrupt(); + if (failed(Rewrite(op, devices.device_names(), &builder))) + return WalkResult::interrupt(); return WalkResult::advance(); }); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc index 244df85f482..c9838ff9651 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc @@ -26,6 +26,7 @@ limitations under the License.
#include "mlir/Support/LLVM.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -34,10 +35,6 @@ namespace mlir { namespace TFTPU { namespace { -constexpr char kXlaShardingAttr[] = "_XlaSharding"; -constexpr char kInputShardingAttr[] = "input_sharding_configuration"; -constexpr char kOutputShardingAttr[] = "output_sharding_configuration"; - struct TPUShardingIdentificationPass : public ModulePass { void runOnModule() override; @@ -68,13 +65,6 @@ void GetAdjacentToXlaShardingOp( } } -llvm::Optional ParseShardingAttribute(Operation* operation) { - const auto& sharding_attr = - operation->getAttrOfType(kXlaShardingAttr); - if (!sharding_attr) return llvm::Optional(); - return sharding_attr.getValue(); -} - // Parse XlaSharding op connected to input args. If Input to // tf_device.LaunchFunc op is of resource type, then XlaSharding op // will be connected to following ReadVariable op. @@ -97,7 +87,7 @@ llvm::Optional ParseInputSharding(const FuncOp func, } if (!parsed_sharding_op) return llvm::Optional(); - return ParseShardingAttribute(parsed_sharding_op->getOperation()); + return tensorflow::ParseShardingAttribute(parsed_sharding_op->getOperation()); } // If operand of return value of tf_device.LaunchFunc op is directly from @@ -105,9 +95,9 @@ llvm::Optional ParseInputSharding(const FuncOp func, llvm::Optional ParseReturnValueSharding(FuncOp func, const int output_index, const OpOperand& operand) { - if (auto sharding_op = - llvm::dyn_cast(operand.get().getDefiningOp())) { - return ParseShardingAttribute(sharding_op.getOperation()); + if (auto sharding_op = llvm::dyn_cast_or_null( + operand.get().getDefiningOp())) { + return tensorflow::ParseShardingAttribute(sharding_op.getOperation()); } return llvm::Optional(); @@ -153,8 +143,8 @@ void IdentifyXlaShardingForTPUComputation(tf_device::LaunchFuncOp launch_func) { if (!input_arg_sharding.hasValue()) continue; sharding_for_args[arg_index] = input_arg_sharding->str(); } - SetShardingConfigurationAsAttribute(launch_func, kInputShardingAttr, - sharding_for_args); + SetShardingConfigurationAsAttribute( + launch_func, tensorflow::kInputShardingAttr, sharding_for_args); // By default return values from logical core 0 is used if no sharding // configuration is defined. 
@@ -176,8 +166,8 @@ void IdentifyXlaShardingForTPUComputation(tf_device::LaunchFuncOp launch_func) { sharding_for_return_values[return_value_index] = return_val_sharding->str(); } - SetShardingConfigurationAsAttribute(launch_func, kOutputShardingAttr, - sharding_for_return_values); + SetShardingConfigurationAsAttribute( + launch_func, tensorflow::kOutputShardingAttr, sharding_for_return_values); } void TPUShardingIdentificationPass::runOnModule() { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 26d1f75b382..6e698c3ca5c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -442,8 +442,7 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, if (!compile) return; auto compile_launch = llvm::dyn_cast(compile); if (!compile_launch || !compile_launch.WrapsSingleOp() || - compile_launch.GetBody().front().getName().getStringRef() != - "tf._TPUCompileMlir") + !llvm::isa(compile_launch.GetBody().front())) return; auto module = while_op.getParentOfType(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc index 912a6aa722f..27939cba63c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc @@ -25,8 +25,6 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "mlir/Analysis/LoopAnalysis.h" // TF:llvm-project -#include "mlir/Dialect/QuantOps/FakeQuantSupport.h" // TF:llvm-project -#include "mlir/Dialect/QuantOps/UniformSupport.h" // TF:llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/OpImplementation.h" // TF:llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 3d5355ba92a..366403e0654 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -2093,7 +2093,7 @@ Status GraphDefImporter::GetControlRetsFromFunctionGraph( // Stateful helper class to import a TensorFlow model expressed in SavedModel // into an MLIR Module. -class SavedModelImporter : public ImporterBase { +class SavedModelObjectGraphImporter : public ImporterBase { public: // Main entry point: converts all functions in the given meta graph to an MLIR // Module. 
@@ -2102,7 +2102,7 @@ class SavedModelImporter : public ImporterBase { absl::Span exported_names, bool add_default_attributes); private: - explicit SavedModelImporter( + explicit SavedModelObjectGraphImporter( const FunctionLibraryDefinition& flib, const GraphDebugInfo& debug_info, const GraphImportConfig& specs, mlir::ModuleOp module, std::unordered_map* tf_name_to_mlir_name, @@ -2799,7 +2799,7 @@ Status CreateSavedModelIR( return Status::OK(); } -StatusOr SavedModelImporter::Convert( +StatusOr SavedModelObjectGraphImporter::Convert( SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, absl::Span exported_names, bool add_default_attributes) { GraphDebugInfo dummy_debug_info; @@ -2828,8 +2828,9 @@ StatusOr SavedModelImporter::Convert( ConvertGraphDefToGraph(options, preprocessed_graphdef, &graph)); NameUniquifier function_name_uniquifier(graph.flib_def()); - SavedModelImporter importer(graph.flib_def(), debug_info, specs, module.get(), - &tf_name_to_mlir_name, &function_name_uniquifier); + SavedModelObjectGraphImporter importer(graph.flib_def(), debug_info, specs, + module.get(), &tf_name_to_mlir_name, + &function_name_uniquifier); auto fn_names = graph.flib_def().ListFunctionNames(); for (const auto& fn_name : fn_names) { @@ -2870,20 +2871,20 @@ StatusOr SavedModelImporter::Convert( // A helper class to import a TensorFlow model expressed in SavedModel V1 into // an MLIR Module in SavedModel dialect. -class SavedModelV1Importer { +class SavedModelSignatureDefImporter { public: // Main entry point: converts all functions (specified by SignatureDefs) in // the given meta graph to an MLIR Module. static StatusOr Convert(const SavedModelBundle& bundle, mlir::MLIRContext* context) { - SavedModelV1Importer importer(bundle, context); + SavedModelSignatureDefImporter importer(bundle, context); return importer.ConvertSignatures(); } private: - SavedModelV1Importer(const SavedModelBundle& bundle, - mlir::MLIRContext* context) + SavedModelSignatureDefImporter(const SavedModelBundle& bundle, + mlir::MLIRContext* context) : bundle_(bundle), module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} @@ -2919,7 +2920,8 @@ class SavedModelV1Importer { mlir::OwningModuleRef module_; }; -StatusOr SavedModelV1Importer::ConvertSignatures() { +StatusOr +SavedModelSignatureDefImporter::ConvertSignatures() { const auto& signatures = bundle_.GetSignatures(); const auto& graphdef = bundle_.meta_graph_def.graph_def(); PopulateTfVersions(module_.get(), graphdef.versions()); @@ -2958,7 +2960,7 @@ StatusOr SavedModelV1Importer::ConvertSignatures() { return std::move(module_); } -Status SavedModelV1Importer::ConvertSignature( +Status SavedModelSignatureDefImporter::ConvertSignature( const GraphDef& graphdef, const std::string& sig_def_key, const std::map& inputs_sorted, const std::map& outputs_sorted, @@ -3022,7 +3024,7 @@ Status SavedModelV1Importer::ConvertSignature( return Status::OK(); } -Status SavedModelV1Importer::LiftVariables() { +Status SavedModelSignatureDefImporter::LiftVariables() { llvm::SmallVector ops; bool contains_ref_variable = false; @@ -3047,7 +3049,7 @@ Status SavedModelV1Importer::LiftVariables() { return Status::OK(); } -void SavedModelV1Importer::LiftVariable(mlir::TF::VarHandleOp op) { +void SavedModelSignatureDefImporter::LiftVariable(mlir::TF::VarHandleOp op) { mlir::OpBuilder builder(&module_->getBodyRegion()); auto func_op = op.getParentOfType(); @@ -3077,7 +3079,7 @@ void SavedModelV1Importer::LiftVariable(mlir::TF::VarHandleOp op) { op.getOperation()->erase(); } -Status 
SavedModelV1Importer::ReadVariablesFromSession( +Status SavedModelSignatureDefImporter::ReadVariablesFromSession( const llvm::SmallVectorImpl& ops) { mlir::OpBuilder builder(&module_->getBodyRegion()); @@ -3140,7 +3142,7 @@ Status SavedModelV1Importer::ReadVariablesFromSession( return Status::OK(); } -GraphImportConfig::InputArrays SavedModelV1Importer::ParseInputArrays( +GraphImportConfig::InputArrays SavedModelSignatureDefImporter::ParseInputArrays( const std::map& inputs) { GraphImportConfig::InputArrays results; for (const auto& iter : inputs) { @@ -3162,7 +3164,7 @@ GraphImportConfig::InputArrays SavedModelV1Importer::ParseInputArrays( return results; } -std::vector SavedModelV1Importer::ParseOutputArrays( +std::vector SavedModelSignatureDefImporter::ParseOutputArrays( const std::map& outputs) { std::vector results; for (const auto& iter : outputs) { @@ -3217,13 +3219,13 @@ StatusOr ConvertGraphToMlir( StatusOr ConvertSavedModelToMlir( SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, absl::Span exported_names, bool add_default_attributes) { - return SavedModelImporter::Convert(saved_model, context, exported_names, - add_default_attributes); + return SavedModelObjectGraphImporter::Convert( + saved_model, context, exported_names, add_default_attributes); } StatusOr ConvertSavedModelV1ToMlir( const SavedModelBundle& saved_model, mlir::MLIRContext* context) { - return SavedModelV1Importer::Convert(saved_model, context); + return SavedModelSignatureDefImporter::Convert(saved_model, context); } std::string MlirModuleToString(mlir::ModuleOp module, bool show_debug_info) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 5e958960d07..d5fcf86cc93 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -116,7 +116,7 @@ mlir::OwningModuleRef GraphdefToMlirTranslateFunction( return module_or.ConsumeValueOrDie(); } -mlir::OwningModuleRef SavedModelToMlirImport( +mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, absl::Span exported_names, mlir::MLIRContext* context) { @@ -137,7 +137,7 @@ mlir::OwningModuleRef SavedModelToMlirImport( return module_or.ConsumeValueOrDie(); } -mlir::OwningModuleRef SavedModelV1ToMlirImport( +mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, mlir::MLIRContext* context) { tensorflow::SavedModelBundle bundle; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index 0380e1165a7..76bada96845 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -52,7 +52,7 @@ mlir::OwningModuleRef GraphdefToSplattedMlirTranslateFunction( // Converts a TensorFlow SavedModel stored in the directory with the given // `saved_model_dir` into a MLIR module. Creates MLIR entities into the // given MLIR `context`. 
-mlir::OwningModuleRef SavedModelToMlirImport( +mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, absl::Span exported_names, mlir::MLIRContext* context); @@ -60,7 +60,7 @@ mlir::OwningModuleRef SavedModelToMlirImport( // Converts a TensorFlow V1 SavedModel stored in the directory with the given // `saved_model_dir` into a MLIR module. Creates MLIR entities into the // given MLIR `context`. -mlir::OwningModuleRef SavedModelV1ToMlirImport( +mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, const std::unordered_set& tags, mlir::MLIRContext* context); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 35a84481851..10aad0a03ff 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -67,6 +67,7 @@ Status ParseMlirModule(llvm::StringRef mlir_module_string, // Converts arg_shapes to xla::Shape's and store into xla_input_shapes. Status GetXlaInputShapes( mlir::ModuleOp module, llvm::ArrayRef arg_shapes, + bool use_tuple_args, const xla::CustomShapeRepresentationFn shape_representation_fn, std::vector* xla_input_shapes) { xla_input_shapes->clear(); @@ -88,8 +89,12 @@ Status GetXlaInputShapes( TF_ASSIGN_OR_RETURN(xla_shape, shape_representation_fn(arg_shapes[i], dtype)); } - xla_input_shapes->push_back( - xla::ShapeUtil::MakeTupleShape(individual_arg_shapes)); + if (use_tuple_args) { + xla_input_shapes->push_back( + xla::ShapeUtil::MakeTupleShape(individual_arg_shapes)); + } else { + *xla_input_shapes = individual_arg_shapes; + } return Status::OK(); } @@ -210,6 +215,7 @@ Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, bool use_tuple_args, bool return_tuple) { mlir::PassManager tf2xla(module_op.getContext()); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); + tf2xla.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); tf2xla.addPass(mlir::TF::CreateStackOpsDecompositionPass()); tf2xla.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); tf2xla.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); @@ -222,13 +228,17 @@ Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, // and canonicalization opportunities that are necessary for the second // LegalizeTFPass(allow_partial_conversion=false) invocation. tf2xla.addNestedPass(mlir::xla_hlo::createLegalizeTFPass(true)); - tf2xla.addPass(mlir::tf_executor::CreateTFExecutorGraphPruningPass()); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); tf2xla.addNestedPass( mlir::xla_hlo::createLegalizeTFPass(false)); - if (VLOG_IS_ON(1)) - tf2xla.enableIRPrinting(std::make_unique()); + if (VLOG_IS_ON(1)) { + // Print the whole module after each pass which requires disabling + // multi-threading as well. + tf2xla.disableMultithreading(); + tf2xla.enableIRPrinting(std::make_unique( + /*print_module_scope=*/true)); + } // Make sure we catch any error reported by MLIR and forward it to the TF // error reporting system. 
Report a generic error if pass manager failed @@ -252,6 +262,7 @@ Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, Status CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, + bool use_tuple_args, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, XlaCompiler::CompilationResult* compilation_result) { mlir::MLIRContext mlir_context; @@ -273,7 +284,7 @@ Status CompileSerializedMlirToXlaHlo( // Convert MLIR module to XLA HLO proto contained in XlaComputation. compilation_result->computation = std::make_shared(); TF_RETURN_IF_ERROR(ConvertMLIRToXlaComputation( - module_op, compilation_result->computation.get(), /*use_tuple_args=*/true, + module_op, compilation_result->computation.get(), use_tuple_args, /*return_tuple=*/true)); // Construct mapping from XlaComputation's arg to input edges of execute @@ -286,7 +297,7 @@ Status CompileSerializedMlirToXlaHlo( }; // Compute all input shapes. - TF_RETURN_IF_ERROR(GetXlaInputShapes(module_op, arg_shapes, + TF_RETURN_IF_ERROR(GetXlaInputShapes(module_op, arg_shapes, use_tuple_args, shape_representation_fn_no_fast_memory, &compilation_result->xla_input_shapes)); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h index ed25aaf929e..41fa8b90e4f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -50,6 +50,7 @@ Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, // metadata and stores them in CompilationResult. Status CompileSerializedMlirToXlaHlo( llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, + bool use_tuple_args, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, XlaCompiler::CompilationResult* compilation_result); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc index 8e0f9cb2497..b258dd68ae1 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc @@ -41,30 +41,31 @@ TEST(CompileSerializedMlirToXlaHloTest, InvalidSerializedMlirModule) { std::vector arg_shapes; XlaCompiler::CompilationResult compilation_result; - Status s = CompileSerializedMlirToXlaHlo(invalid_mlir_module, arg_shapes, - TestShapeRepresentation, - &compilation_result); + Status s = CompileSerializedMlirToXlaHlo( + invalid_mlir_module, arg_shapes, + /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); EXPECT_EQ(s.code(), tensorflow::errors::Code::INVALID_ARGUMENT); EXPECT_EQ(s.ToString(), "Invalid argument: could not parse MLIR module: error: " "custom op 'totally' is unknown\n"); } -TEST(CompileSerializedMlirToXlaHloTest, Success) { - string mlir_module = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf.AddV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor, tensor) -> tensor - return %0 : tensor - } +constexpr llvm::StringRef kBinaryAddModule = R"( + module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.AddV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor, tensor) -> tensor + return %0 : tensor } - )"; + } +)"; +TEST(CompileSerializedMlirToXlaHloTest, TupleArgs) { std::vector 
arg_shapes(2, TensorShape()); XlaCompiler::CompilationResult compilation_result; Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, TestShapeRepresentation, &compilation_result); + kBinaryAddModule, arg_shapes, + /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); ASSERT_TRUE(s.ok()); const xla::HloModuleConfig module_config( @@ -86,7 +87,7 @@ ENTRY %main.6 (arg_tuple.1: (f32[], f32[])) -> (f32[]) { EXPECT_EQ(expected_hlo_module_string, status_or_hlo_module.ValueOrDie()->ToString()); - // Expect an iota like input mapping. + // Expect an in order input mapping. EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); // Expect a single tuple-shape, containing two F32 scalars. @@ -116,6 +117,62 @@ ENTRY %main.6 (arg_tuple.1: (f32[], f32[])) -> (f32[]) { EXPECT_TRUE(compilation_result.resource_updates.empty()); } +TEST(CompileSerializedMlirToXlaHloTest, IndividualArgs) { + std::vector arg_shapes(2, TensorShape()); + XlaCompiler::CompilationResult compilation_result; + + Status s = CompileSerializedMlirToXlaHlo( + kBinaryAddModule, arg_shapes, + /*use_tuple_args=*/false, TestShapeRepresentation, &compilation_result); + ASSERT_TRUE(s.ok()); + + const xla::HloModuleConfig module_config( + compilation_result.computation->GetProgramShape().ValueOrDie()); + auto status_or_hlo_module = xla::HloModule::CreateFromProto( + compilation_result.computation->proto(), module_config); + ASSERT_TRUE(status_or_hlo_module.ok()); + string expected_hlo_module_string = R"(HloModule main.5 + +ENTRY %main.5 (Arg_0.1: f32[], Arg_1.2: f32[]) -> (f32[]) { + %Arg_0.1 = f32[] parameter(0) + %Arg_1.2 = f32[] parameter(1) + %add.3 = f32[] add(f32[] %Arg_0.1, f32[] %Arg_1.2) + ROOT %tuple.4 = (f32[]) tuple(f32[] %add.3) +} + +)"; + EXPECT_EQ(expected_hlo_module_string, + status_or_hlo_module.ValueOrDie()->ToString()); + + // Expect an in order input mapping. + EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); + + // Expect two inputs, each containing a F32 scalar. + EXPECT_EQ(compilation_result.xla_input_shapes.size(), 2); + xla::Shape expected_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {}); + EXPECT_EQ(compilation_result.xla_input_shapes[0], expected_input_shape); + EXPECT_EQ(compilation_result.xla_input_shapes[1], expected_input_shape); + + // Expect output shape is a tuple shape containing a single F32 Scalar type. + const xla::Shape output_shape = + xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {}); + const xla::Shape tuple_output_shape = + xla::ShapeUtil::MakeTupleShape({output_shape}); + EXPECT_EQ(compilation_result.xla_output_shape, tuple_output_shape); + + // Expect exactly 1 OutputDescription. + EXPECT_EQ(compilation_result.outputs.size(), 1); + const XlaCompiler::OutputDescription& output_desc = + compilation_result.outputs.front(); + EXPECT_EQ(output_desc.type, DataType::DT_FLOAT); + EXPECT_EQ(output_desc.shape, TensorShape()); + EXPECT_FALSE(output_desc.is_constant); + EXPECT_FALSE(output_desc.is_tensor_list); + + // Expect no resource updates from computation. + EXPECT_TRUE(compilation_result.resource_updates.empty()); +} + // Tests that foldable ops are constant-folded to enable legalization of ops // that require compile time constant operand. 
TEST(CompileSerializedMlirToXlaHloTest, CompileTimeConstantFoldedSuccess) { @@ -136,7 +193,8 @@ TEST(CompileSerializedMlirToXlaHloTest, CompileTimeConstantFoldedSuccess) { XlaCompiler::CompilationResult compilation_result; Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, TestShapeRepresentation, &compilation_result); + mlir_module, arg_shapes, + /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); ASSERT_TRUE(s.ok()); const xla::HloModuleConfig module_config( @@ -174,7 +232,8 @@ TEST(CompileSerializedMlirToXlaHloTest, ShapeInference) { XlaCompiler::CompilationResult compilation_result; Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, TestShapeRepresentation, &compilation_result); + mlir_module, arg_shapes, + /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); TF_ASSERT_OK(s); const xla::HloModuleConfig module_config( diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc index 7b0cbe6d5b5..84a8969a486 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc @@ -57,6 +57,18 @@ Status ConvertDataType(DataType dtype, Builder builder, Type* type) { case DT_INT64: *type = builder.getIntegerType(64); return Status::OK(); + case DT_UINT8: + *type = builder.getIntegerType(8, /*isSigned=*/false); + return Status::OK(); + case DT_UINT16: + *type = builder.getIntegerType(16, /*isSigned=*/false); + return Status::OK(); + case DT_UINT32: + *type = builder.getIntegerType(32, /*isSigned=*/false); + return Status::OK(); + case DT_UINT64: + *type = builder.getIntegerType(64, /*isSigned=*/false); + return Status::OK(); case DT_BFLOAT16: *type = builder.getBF16Type(); return Status::OK(); @@ -99,16 +111,16 @@ Status ConvertScalarTypeToDataType(Type type, DataType* dtype) { *dtype = DT_BOOL; return Status::OK(); case 8: - *dtype = DT_INT8; + *dtype = itype.isUnsigned() ? DT_UINT8 : DT_INT8; return Status::OK(); case 16: - *dtype = DT_INT16; + *dtype = itype.isUnsigned() ? DT_UINT16 : DT_INT16; return Status::OK(); case 32: - *dtype = DT_INT32; + *dtype = itype.isUnsigned() ? DT_UINT32 : DT_INT32; return Status::OK(); case 64: - *dtype = DT_INT64; + *dtype = itype.isUnsigned() ? DT_UINT64 : DT_INT64; return Status::OK(); default: return errors::Unimplemented( diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc index e983f3e9c0c..9561d0a2f93 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.cc @@ -20,7 +20,9 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Regex.h" #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project @@ -33,50 +35,124 @@ namespace tensorflow { constexpr char kDevicesAttr[] = "tf.devices"; -void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set) { - if (!device_set) return; +namespace { - // Collect devices as strings in TensorFlow device name form. 
- llvm::SmallVector devices; - devices.reserve(device_set->devices().size()); - for (Device* device : device_set->devices()) - devices.push_back( - DeviceNameUtils::ParsedNameToString(device->parsed_name())); +// Parse GPU compute capability from physical device description. If compute +// capability is not found in device description, return an empty dictionary +// attribute. +mlir::DictionaryAttr ParseGpuDeviceMetadata(const Device& device, + mlir::Builder* builder) { + // Parse GPU device compute capability from physical device description. + static auto* r = new llvm::Regex("compute capability: ([0-9]+)\\.([0-9]+)"); - llvm::SmallVector device_refs(devices.begin(), - devices.end()); - mlir::Builder builder(op->getContext()); - op->setAttr(kDevicesAttr, builder.getStrArrayAttr(device_refs)); + llvm::SmallVector cc; + if (r->match(device.attributes().physical_device_desc(), &cc)) { + return mlir::TF::GpuDeviceMetadata::get( + builder->getI32IntegerAttr(std::stoi(cc[1].str())), + builder->getI32IntegerAttr(std::stoi(cc[2].str())), + builder->getContext()); + } + + return builder->getDictionaryAttr({}); } -mlir::LogicalResult GetDevicesFromOp( - mlir::Operation* op, - llvm::SmallVectorImpl* devices) { - auto devices_attr = op->getAttr(kDevicesAttr); - if (!devices_attr) return mlir::success(); +// Get devices from an array of string attributes. +// TODO(ezhulenev): Update all tests to use dictionary attribute for +// `tf.devices` and remove this function. +mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, + mlir::ArrayAttr array_attr, + mlir::TF::RuntimeDevices* devices) { + DeviceNameUtils::ParsedName device; - auto array_attr = devices_attr.dyn_cast(); - if (!array_attr) - return op->emitOpError( - llvm::formatv("bad '{0}' attribute, not an array", kDevicesAttr)); + for (auto& kv : llvm::enumerate(array_attr)) { + const int idx = kv.index(); - devices->resize(array_attr.size()); - for (auto attr_and_idx : llvm::enumerate(array_attr)) { - const int idx = attr_and_idx.index(); - auto string_attr = attr_and_idx.value().dyn_cast(); + auto string_attr = kv.value().dyn_cast(); if (!string_attr) return op->emitOpError(llvm::formatv( "bad '{0}' attribute at index {1}, not a string", kDevicesAttr, idx)); - if (!DeviceNameUtils::ParseFullName(string_attr.getValue().str(), - &(*devices)[idx])) + if (DeviceNameUtils::ParseFullName(string_attr.getValue().str(), &device)) { + devices->AddDevice(device); + } else { return op->emitOpError( - llvm::formatv("bad '{0}' attribute at index {1} with value '{2}', " - "not a valid device", - kDevicesAttr, idx, string_attr.getValue())); + llvm::formatv("bad '{0}' attribute, '{1}', not a valid device", + kDevicesAttr, string_attr.getValue())); + } } return mlir::success(); } +// Get devices from a dictionary attribute. +mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, + mlir::DictionaryAttr dict_attr, + mlir::TF::RuntimeDevices* devices) { + DeviceNameUtils::ParsedName device; + + // Parse device names and metadata from dictionary attribute. 
+ for (auto& kv : dict_attr) { + const mlir::Identifier name = kv.first; + const mlir::Attribute attr = kv.second; + + if (!DeviceNameUtils::ParseFullName(name.str(), &device)) + return op->emitOpError( + llvm::formatv("bad '{0}' attribute, '{1}', not a valid device", + kDevicesAttr, name.strref())); + + if (auto gpu_metadata = attr.dyn_cast()) { + devices->AddGpuDevice(device, gpu_metadata); + } else { + devices->AddDevice(device); + } + } + + return mlir::success(); +} + +} // namespace + +void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set) { + if (!device_set) return; + + mlir::MLIRContext* ctx = op->getContext(); + mlir::Builder builder(ctx); + + // Collect devices with attached metadata. + llvm::SmallVector devices; + devices.reserve(device_set->devices().size()); + + // For device that do not have any metadata, or if we failed to parse metadata + // from the DeviceSet, we add empty dictionary to the `tf.devices` attribute. + for (Device* device : device_set->devices()) { + string name = DeviceNameUtils::ParsedNameToString(device->parsed_name()); + + if (device->device_type() == DEVICE_GPU) { + auto metadata = ParseGpuDeviceMetadata(*device, &builder); + devices.push_back(builder.getNamedAttr(name, metadata)); + } else { + auto metadata = builder.getDictionaryAttr({}); + devices.push_back(builder.getNamedAttr(name, metadata)); + } + } + + op->setAttr(kDevicesAttr, builder.getDictionaryAttr(devices)); +} + +mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, + mlir::TF::RuntimeDevices* devices) { + auto devices_attr = op->getAttr(kDevicesAttr); + if (!devices_attr) return mlir::success(); + + if (auto array_attr = devices_attr.dyn_cast()) { + return GetDevicesFromOp(op, array_attr, devices); + + } else if (auto dict_attr = devices_attr.dyn_cast()) { + return GetDevicesFromOp(op, dict_attr, devices); + } + + return op->emitOpError( + llvm::formatv("unsupported '{0}' attribute", kDevicesAttr)); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/device_util.h index 73ae18d2487..1cbf0517554 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util.h @@ -19,22 +19,27 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { + // Collects all devices known to the system by name and adds them as a -// `tf.devices` array attribute of string attributes to an op. Device names -// added are in the following form: +// `tf.devices` dictionary attribute with a full device name as a key, and +// device metadata as a value. +// +// Device names added in full parsed device form: // /job:/replica:/task:/device:: +// +// Supported device metadata types: +// (1) GpuDeviceMetadata: GPU device compute capability. void AddDevicesToOp(mlir::Operation* op, const DeviceSet* device_set); -// Collects devices as DeviceNameUtils::ParsedName from an op `tf.devices` -// attribute. A failure will be returned if the attribute is not an -// ArrayAttr or the devices are invalid. 
-mlir::LogicalResult GetDevicesFromOp( - mlir::Operation* op, - llvm::SmallVectorImpl* devices); +// Collects devices information from an op `tf.devices` attributes. Returns +// failure if can't parse device metadata from the attribute. +mlir::LogicalResult GetDevicesFromOp(mlir::Operation* op, + mlir::TF::RuntimeDevices* devices); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index cb25e000f7a..25e55e23c1a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include -#include #include #include @@ -46,13 +45,15 @@ class FakeDevice : public Device { Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); } - static std::unique_ptr Make(const string& name) { + static std::unique_ptr Make(const string& name, + const string& desc = "") { DeviceNameUtils::ParsedName parsed_name; DeviceNameUtils::ParseFullName(name, &parsed_name); DeviceAttributes device_attributes; device_attributes.set_name(name); device_attributes.set_device_type(parsed_name.type); + device_attributes.set_physical_device_desc(desc); return std::make_unique(device_attributes); } }; @@ -62,26 +63,40 @@ TEST(DeviceUtilTest, AddDeviceToOp) { mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); - DeviceSet device_set; - llvm::SmallVector, 2> devices; - devices.push_back( - FakeDevice::Make("/job:worker/replica:0/task:0/device:CPU:0")); - devices.push_back( - FakeDevice::Make("/job:worker/replica:1/task:2/device:GPU:3")); - for (auto& device : devices) device_set.AddDevice(device.get()); + const std::string cpu0 = "/job:worker/replica:0/task:0/device:CPU:0"; + const std::string gpu0 = "/job:worker/replica:1/task:2/device:GPU:0"; + const std::string gpu1 = "/job:worker/replica:1/task:2/device:GPU:1"; + llvm::SmallVector, 2> devices; + devices.push_back(FakeDevice::Make(cpu0)); + devices.push_back(FakeDevice::Make(gpu0, "compute capability: 7.0")); + devices.push_back(FakeDevice::Make(gpu1)); + + DeviceSet device_set; + for (auto& device : devices) device_set.AddDevice(device.get()); AddDevicesToOp(*module_ref, &device_set); - auto devices_attr = module_ref->getAttrOfType("tf.devices"); + + auto devices_attr = + module_ref->getAttrOfType("tf.devices"); ASSERT_NE(devices_attr, nullptr); - ASSERT_EQ(devices_attr.size(), 2); - auto device_attr_0 = devices_attr.getValue()[0].dyn_cast(); - ASSERT_NE(device_attr_0, nullptr); - EXPECT_EQ(device_attr_0.getValue(), - "/job:worker/replica:0/task:0/device:CPU:0"); - auto device_attr_1 = devices_attr.getValue()[1].dyn_cast(); - ASSERT_NE(device_attr_1, nullptr); - EXPECT_EQ(device_attr_1.getValue(), - "/job:worker/replica:1/task:2/device:GPU:3"); + ASSERT_EQ(devices_attr.size(), 3); + + // CPU device added with an empty metadata. + auto device_meta_0 = devices_attr.get(cpu0).dyn_cast(); + ASSERT_NE(device_meta_0, nullptr); + ASSERT_EQ(device_meta_0.size(), 0); + + // GPU device successfully parsed compute capability from description. + auto device_meta_1 = + devices_attr.get(gpu0).dyn_cast(); + ASSERT_NE(device_meta_1, nullptr); + ASSERT_EQ(device_meta_1.cc_major().getInt(), 7); + ASSERT_EQ(device_meta_1.cc_minor().getInt(), 0); + + // If description is empty GPU devices added with an empty metadata. 
+ auto device_meta_2 = devices_attr.get(gpu1).dyn_cast(); + ASSERT_NE(device_meta_2, nullptr); + ASSERT_EQ(device_meta_2.size(), 0); } TEST(DeviceUtilTest, AddDeviceToOpNullDeviceSet) { @@ -98,7 +113,7 @@ TEST(DeviceUtilTest, GetDevicesFromOpNoDevicesAttribute) { mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); - llvm::SmallVector devices; + mlir::TF::RuntimeDevices devices; EXPECT_TRUE(mlir::succeeded(GetDevicesFromOp(*module_ref, &devices))); } @@ -109,7 +124,7 @@ TEST(DeviceUtilTest, GetDevicesFromOpBadDevicesAttributeType) { mlir::Builder builder(*module_ref); module_ref->setAttr("tf.devices", builder.getBoolAttr(false)); - llvm::SmallVector devices; + mlir::TF::RuntimeDevices devices; EXPECT_TRUE(mlir::failed(GetDevicesFromOp(*module_ref, &devices))); } @@ -120,7 +135,7 @@ TEST(DeviceUtilTest, GetDevicesFromOpBadDevicesAttributeArraySubtype) { mlir::Builder builder(*module_ref); module_ref->setAttr("tf.devices", builder.getI32ArrayAttr({8})); - llvm::SmallVector devices; + mlir::TF::RuntimeDevices devices; EXPECT_TRUE(mlir::failed(GetDevicesFromOp(*module_ref, &devices))); } @@ -129,9 +144,11 @@ TEST(DeviceUtilTest, GetDevicesFromOpBadDevicesInDevicesAttribute) { mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::Builder builder(*module_ref); - module_ref->setAttr("tf.devices", builder.getStrArrayAttr({"bad_device"})); + module_ref->setAttr("tf.devices", + builder.getDictionaryAttr(builder.getNamedAttr( + "bad_device", builder.getDictionaryAttr({})))); - llvm::SmallVector devices; + mlir::TF::RuntimeDevices devices; EXPECT_TRUE(mlir::failed(GetDevicesFromOp(*module_ref, &devices))); } @@ -140,16 +157,53 @@ TEST(DeviceUtilTest, GetDevicesFromOpValidDeviceInDevicesAttribute) { mlir::OwningModuleRef module_ref = mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); mlir::Builder builder(*module_ref); - module_ref->setAttr( - "tf.devices", - builder.getStrArrayAttr({"/job:worker/replica:0/task:0/device:CPU:0"})); - llvm::SmallVector devices; + auto device_dict = builder.getDictionaryAttr( + {builder.getNamedAttr("/job:worker/replica:0/task:0/device:CPU:0", + builder.getDictionaryAttr({}))}); + module_ref->setAttr("tf.devices", device_dict); + + mlir::TF::RuntimeDevices devices; EXPECT_TRUE(mlir::succeeded(GetDevicesFromOp(*module_ref, &devices))); - ASSERT_EQ(devices.size(), 1); - EXPECT_EQ(DeviceNameUtils::ParsedNameToString(devices[0]), + + ASSERT_EQ(devices.NumDevices(), 1); + ASSERT_EQ(devices.device_names().size(), 1); + ASSERT_EQ(DeviceNameUtils::ParsedNameToString(devices.device_names()[0]), "/job:worker/replica:0/task:0/device:CPU:0"); } +TEST(DeviceUtilTest, GetGpuDeviceMetadata) { + mlir::MLIRContext context; + mlir::OwningModuleRef module_ref = + mlir::ModuleOp::create(mlir::UnknownLoc::get(&context)); + + mlir::Builder builder(*module_ref); + + const std::string gpu0 = "/job:worker/replica:0/task:0/device:GPU:0"; + const std::string gpu1 = "/job:worker/replica:0/task:0/device:GPU:1"; + + llvm::SmallVector metadata; + metadata.push_back(builder.getNamedAttr( + gpu0, mlir::TF::GpuDeviceMetadata::get(builder.getI32IntegerAttr(1), + builder.getI32IntegerAttr(2), + module_ref->getContext()))); + + module_ref->setAttr("tf.devices", builder.getDictionaryAttr(metadata)); + + mlir::TF::RuntimeDevices devices; + EXPECT_TRUE(mlir::succeeded(GetDevicesFromOp(*module_ref, &devices))); + + DeviceNameUtils::ParsedName parsed_name; + DeviceNameUtils::ParseFullName(gpu0, &parsed_name); + auto meta_0 = 
devices.GetGpuDeviceMetadata(parsed_name); + ASSERT_TRUE(meta_0.hasValue()); + ASSERT_EQ(meta_0->cc_major().getInt(), 1); + ASSERT_EQ(meta_0->cc_minor().getInt(), 2); + + DeviceNameUtils::ParseFullName(gpu1, &parsed_name); + auto meta_1 = devices.GetGpuDeviceMetadata(parsed_name); + ASSERT_FALSE(meta_1.hasValue()); +} + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index 1b8ae8403bf..36a59d12060 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -170,4 +170,16 @@ std::string GetDumpDirFromEnvVar() { return result; } +std::string DumpRawStringToFile(llvm::StringRef name, llvm::StringRef content, + llvm::StringRef dirname) { + std::unique_ptr os; + std::string filepath; + Status result = CreateFileForDumping(name, &os, &filepath, dirname); + if (!result.ok()) return result.error_message(); + + (*os) << content; + LOG(INFO) << "Outputted requested string to '" << filepath << "'"; + return filepath; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h index 14c0d1f0b6e..79c4961273a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h @@ -56,6 +56,14 @@ std::string DumpMlirOpToFile(llvm::StringRef name, mlir::Operation* op, // cannot be determined and generates a warning message. std::string GetDumpDirFromEnvVar(); +// Dumps a raw string to a file and returns the file name used. +// +// This will create a file name via prefixing `name` with the value of the +// TF_DUMP_GRAPH_PREFIX environment variable if `dirname` is empty and +// suffixing `name` with ".mlir". 
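// A minimal usage sketch (illustrative only; the directory and module text
// below are made-up values, and the exact returned path may differ):
//
//   setenv("TF_DUMP_GRAPH_PREFIX", "/tmp/mlir_dumps", 1);
//   std::string path = DumpRawStringToFile("example", "module {\n}");
//   // `path` is expected to be roughly "/tmp/mlir_dumps/example.mlir", or an
//   // error string such as "(unavailable)" if the file could not be created.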
+std::string DumpRawStringToFile(llvm::StringRef name, llvm::StringRef content, + llvm::StringRef dirname = ""); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index 947a0ef0af3..69e90de3cb6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -69,5 +69,20 @@ TEST(DumpMlirModuleTest, Valid) { EXPECT_EQ(file_txt_module, expected_txt_module); } +TEST(DumpRawStringToFileTest, Valid) { + llvm::StringRef example = "module {\n}"; + setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1); + + std::string filepath = DumpRawStringToFile("example", example); + ASSERT_NE(filepath, "(TF_DUMP_GRAPH_PREFIX not specified)"); + ASSERT_NE(filepath, "LOG(INFO)"); + ASSERT_NE(filepath, "(unavailable)"); + + Env* env = Env::Default(); + std::string file_txt_module; + TF_ASSERT_OK(ReadFileToString(env, filepath, &file_txt_module)); + EXPECT_EQ(file_txt_module, example); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc index 3f4947bec23..61214108957 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc @@ -58,7 +58,8 @@ TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) { emitError(loc) << "Second diagnostic message reported"; return tensorflow::errors::Internal("Passed in error"); }; - Status s = StatusScopedDiagnosticHandler(&context).Combine(function()); + StatusScopedDiagnosticHandler ssdh(&context); + Status s = ssdh.Combine(function()); ASSERT_TRUE(tensorflow::errors::IsInternal(s)); EXPECT_THAT(s.error_message(), HasSubstr("Passed in error")); EXPECT_THAT(s.error_message(), HasSubstr("Diagnostic message reported")); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 33a09a6ddfb..6cf2781e48d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -29,7 +29,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" -#include "tensorflow/compiler/xla/array3d.h" +#include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/types.h" @@ -39,9 +39,9 @@ limitations under the License. #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { -// Device coordinates are defined as (x, y, core), thus resulting in a rank 3 +// Device coordinates are defined as (x, y, z, core), thus resulting in a rank 4 // topology. -constexpr int kTPUTopologyRank = 3; +constexpr int kTPUTopologyRank = 4; constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM"; constexpr char kDeviceTPU[] = "TPU"; @@ -209,43 +209,43 @@ struct TaskAndDevice { }; // Checks if device coordinate is outside of topology mesh shape bounds. 
-bool DeviceCoordinateOutOfBound(int x, int y, int core, int bound_x, - int bound_y, int bound_core) { - return x < 0 || x >= bound_x || y < 0 || y >= bound_y || core < 0 || - core >= bound_core; +bool DeviceCoordinateOutOfBound(int x, int y, int z, int core, int bound_x, + int bound_y, int bound_z, int bound_core) { + return x < 0 || x >= bound_x || y < 0 || y >= bound_y || z < 0 || + z >= bound_z || core < 0 || core >= bound_core; } // Creates error message for an out of bound device coordinate. Status DeviceCoordinateErrorMsg(absl::string_view attribute, int x, int y, - int core, int bound_x, int bound_y, - int bound_core) { - return errors::InvalidArgument("device coordinate (", x, ", ", y, ", ", core, - ") in '", attribute, + int z, int core, int bound_x, int bound_y, + int bound_z, int bound_core) { + return errors::InvalidArgument("device coordinate (", x, ", ", y, ", ", z, + ", ", core, ") in '", attribute, "' is outside of mesh shape (", bound_x, ", ", - bound_y, ", ", bound_core, ")"); + bound_y, ", ", bound_z, ", ", bound_core, ")"); } // Creates error message for a duplicate device coordinate. Status DuplicateCoordinateErrorMsg(absl::string_view attribute, int x, int y, - int core) { + int z, int core) { return errors::InvalidArgument("'", attribute, "' has duplicate device coordinate (", x, ", ", - y, ", ", core, ")"); + y, ", ", z, ", ", core, ")"); } // Parses and validates topology (serialized string of TopologyProto), and maps -// device coordinate (x, y, core) to task and device (of available TPUs). +// device coordinate (x, y, z, core) to task and device (of available TPUs). // Topology attribute device coordinates are ordered by task then device (major // to minor). // // A valid TopologyProto must have: -// - a valid mesh shape (rank 3 with positive dimensions) +// - a valid mesh shape (rank 4 with positive dimensions) // - `num_tasks` and `num_tpu_devices_per_task` must match the number of // available TPU hosts and devices per host // - device coordinates within the mesh shape // - no duplicate device coordinates // - number of device coordinates (in tuple 3) match number of availabe TPUs -StatusOr> ParseTopologyAttr( +StatusOr> ParseTopologyAttr( llvm::StringRef topology_attr, int num_tasks, int num_tpus_per_task) { tpu::TopologyProto topology_proto; if (!topology_proto.ParseFromString(topology_attr.str())) @@ -288,22 +288,25 @@ StatusOr> ParseTopologyAttr( const int bound_x = topology_proto.mesh_shape(0); const int bound_y = topology_proto.mesh_shape(1); - const int bound_core = topology_proto.mesh_shape(2); + const int bound_z = topology_proto.mesh_shape(2); + const int bound_core = topology_proto.mesh_shape(3); - xla::Array3D topology(bound_x, bound_y, bound_core, {}); + xla::Array4D topology(bound_x, bound_y, bound_z, bound_core); int pos = 0; for (int task = 0; task < num_tasks; ++task) { for (int device = 0; device < num_tpus_per_task; ++device) { int x = topology_proto.device_coordinates(pos++); int y = topology_proto.device_coordinates(pos++); + int z = topology_proto.device_coordinates(pos++); int core = topology_proto.device_coordinates(pos++); - if (DeviceCoordinateOutOfBound(x, y, core, bound_x, bound_y, bound_core)) - return DeviceCoordinateErrorMsg(kTopologyAttr, x, y, core, bound_x, - bound_y, bound_core); + if (DeviceCoordinateOutOfBound(x, y, z, core, bound_x, bound_y, bound_z, + bound_core)) + return DeviceCoordinateErrorMsg(kTopologyAttr, x, y, z, core, bound_x, + bound_y, bound_z, bound_core); - auto& task_and_device = topology(x, y, core); + 
auto& task_and_device = topology(x, y, z, core); if (task_and_device.task != -1) - return DuplicateCoordinateErrorMsg(kTopologyAttr, x, y, core); + return DuplicateCoordinateErrorMsg(kTopologyAttr, x, y, z, core); task_and_device = {task, device}; } @@ -346,16 +349,18 @@ GetGeneralTPUExecutionDeviceAssignment( const int bound_x = topology.n1(); const int bound_y = topology.n2(); - const int bound_core = topology.n3(); + const int bound_z = topology.n3(); + const int bound_core = topology.n4(); // TPU XLA device ID is determined by its device coordinate, from major to - // minor coordinates (y, x, core). - auto location_to_id = [&](int x, int y, int core) { - return x * bound_core + y * bound_x * bound_core + core; + // minor coordinates (z, y, x, core). + auto location_to_id = [&](int x, int y, int z, int core) { + return (x + bound_x * (y + bound_y * z)) * bound_core + core; }; std::vector used_device_ids( - location_to_id(bound_x - 1, bound_y - 1, bound_core - 1), false); + location_to_id(bound_x - 1, bound_y - 1, bound_z - 1, bound_core - 1), + false); ExecutionDevices execution_devices( num_replicas, llvm::SmallVector(num_cores_per_replica, "")); @@ -366,22 +371,25 @@ GetGeneralTPUExecutionDeviceAssignment( ++logical_core) { int x = device_assignment_attr[pos++]; int y = device_assignment_attr[pos++]; + int z = device_assignment_attr[pos++]; int core = device_assignment_attr[pos++]; - if (DeviceCoordinateOutOfBound(x, y, core, bound_x, bound_y, bound_core)) - return DeviceCoordinateErrorMsg(kDeviceAssignmentAttr, x, y, core, - bound_x, bound_y, bound_core); + if (DeviceCoordinateOutOfBound(x, y, z, core, bound_x, bound_y, bound_z, + bound_core)) + return DeviceCoordinateErrorMsg(kDeviceAssignmentAttr, x, y, z, core, + bound_x, bound_y, bound_z, bound_core); - TaskAndDevice task_and_device = topology(x, y, core); + TaskAndDevice task_and_device = topology(x, y, z, core); const int task = task_and_device.task; const int device = task_and_device.device; if (task == -1 || device == -1) return errors::InvalidArgument( "no TPU device found for '", kDeviceAssignmentAttr, - "' device coordinate (", x, ", ", y, ", ", core, ")"); + "' device coordinate (", x, ", ", y, ", ", z, ", ", core, ")"); - const int device_id = location_to_id(x, y, core); + const int device_id = location_to_id(x, y, z, core); if (used_device_ids[device_id]) - return DuplicateCoordinateErrorMsg(kDeviceAssignmentAttr, x, y, core); + return DuplicateCoordinateErrorMsg(kDeviceAssignmentAttr, x, y, z, + core); used_device_ids[device_id] = true; device_assignment(replica, logical_core) = device_id; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index de7009b495f..87319f2adeb 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -129,6 +129,7 @@ std::string TopologyWithDeviceCoordinates( topology_proto.add_mesh_shape(2); topology_proto.add_mesh_shape(1); topology_proto.add_mesh_shape(1); + topology_proto.add_mesh_shape(1); topology_proto.set_num_tasks(2); topology_proto.set_num_tpu_devices_per_task(1); for (int device_coordinate : device_coordinates) @@ -155,89 +156,100 @@ INSTANTIATE_TEST_SUITE_P( "failed to parse 'topology' attribute to TopologyProto"), std::make_tuple(4, 2, TopologyWithMeshShape({0}), std::vector(), - "'topology' 'mesh_shape' must be rank 3, got rank 1"), + "'topology' 'mesh_shape' 
must be rank 4, got rank 1"), std::make_tuple( - 2, 1, TopologyWithMeshShape({2, 0, 2}), std::vector(), + 2, 1, TopologyWithMeshShape({2, 0, 1, 2}), std::vector(), "'topology' 'mesh_shape' dimension 1 must be positive, got 0"), - std::make_tuple(2, 1, TopologyWithMeshShapeAndTasks({1, 1, 1}, 1, 1), + std::make_tuple(2, 1, TopologyWithMeshShapeAndTasks({1, 1, 1, 1}, 1, 1), std::vector(), "number of tasks from available TPU devices must be " "'num_tasks' in 'topology' (1), got 2"), - std::make_tuple(2, 1, TopologyWithMeshShapeAndTasks({1, 1, 1}, 2, 2), + std::make_tuple(2, 1, TopologyWithMeshShapeAndTasks({1, 1, 1, 1}, 2, 2), std::vector(), "number of TPU devices available per task must be " "'num_tpu_devices_per_task' in 'topology' (2), got 1"), std::make_tuple( 2, 1, TopologyWithDeviceCoordinates({}), std::vector(), "length of 'device_coordinates' in 'topology' must be 'num_tasks' " - "* 'num_tpus_per_task' * 3 (2 * 1 * 3), got 0"), - std::make_tuple(2, 1, - TopologyWithDeviceCoordinates({-1, 0, 0, 1, 0, 0}), - std::vector(), - "device coordinate (-1, 0, 0) in 'topology' is outside " - "of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({2, 0, 0, 1, 0, 0}), - std::vector(), - "device coordinate (2, 0, 0) in 'topology' is outside " - "of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, - TopologyWithDeviceCoordinates({0, -1, 0, 1, 0, 0}), - std::vector(), - "device coordinate (0, -1, 0) in 'topology' is outside " - "of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 1, 0, 1, 0, 0}), - std::vector(), - "device coordinate (0, 1, 0) in 'topology' is outside " - "of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, - TopologyWithDeviceCoordinates({0, 0, -1, 1, 0, 0}), - std::vector(), - "device coordinate (0, 0, -1) in 'topology' is outside " - "of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 1, 1, 0, 0}), - std::vector(), - "device coordinate (0, 0, 1) in 'topology' is outside " - "of mesh shape (2, 1, 1)"), + "* 'num_tpus_per_task' * 4 (2 * 1 * 4), got 0"), std::make_tuple( - 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 0, 0}), + 2, 1, TopologyWithDeviceCoordinates({-1, 0, 0, 0, 1, 0, 0, 0}), std::vector(), - "'topology' has duplicate device coordinate (0, 0, 0)"))); + "device coordinate (-1, 0, 0, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({2, 0, 0, 0, 1, 0, 0, 0}), + std::vector(), + "device coordinate (2, 0, 0, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, -1, 0, 0, 1, 0, 0, 0}), + std::vector(), + "device coordinate (0, -1, 0, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 1, 0, 0, 1, 0, 0, 0}), + std::vector(), + "device coordinate (0, 1, 0, 0) in 'topology' is outside " + "of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, -1, 1, 0, 0, 0}), + std::vector(), + "device coordinate (0, 0, 0, -1) in 'topology' is outside " + "of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 1, 0, 0, 0}), + std::vector(), + "device coordinate (0, 0, 0, 1) in 'topology' is outside " + "of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 0, 0, 0, 0}), + std::vector(), + "'topology' has duplicate device coordinate (0, 0, 0, 0)"))); 
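// The expected strings above follow the rank 4 (x, y, z, core) coordinate
// scheme. Below is a minimal sketch of how such a coordinate maps to an XLA
// device ID, mirroring the location_to_id lambda in
// tpu_rewrite_device_util.cc (the mesh bounds and coordinates used in the
// comments are made-up values):
inline int LocationToIdSketch(int x, int y, int z, int core, int bound_x,
                              int bound_y, int bound_core) {
  // Core is the minor-most dimension, followed by x, y, then z.
  return (x + bound_x * (y + bound_y * z)) * bound_core + core;
}
// With mesh shape (2, 2, 1, 2): (0, 0, 0, 0) -> 0, (1, 0, 0, 1) -> 3, and
// (1, 1, 0, 1) -> (1 + 2 * (1 + 2 * 0)) * 2 + 1 = 7.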
INSTANTIATE_TEST_SUITE_P( BadGeneralDeviceAssignmentMetadata, ParameterizedMetadataTest, ::testing::Values( - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), + std::make_tuple(2, 1, + TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), std::vector(), "length of 'device_assignment' must be 'num_replicas' " - "* 'num_cores_per_replica' * 3 (2 * 1 * 3), got 0"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{-1, 0, 0, 0, 0, 0}, - "device coordinate (-1, 0, 0) in 'device_assignment' " - "is outside of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{2, 0, 0, 0, 0, 0}, - "device coordinate (2, 0, 0) in 'device_assignment' is " - "outside of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{0, -1, 0, 0, 0, 0}, - "device coordinate (0, -1, 0) in 'device_assignment' " - "is outside of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{0, 1, 0, 0, 0, 0}, - "device coordinate (0, 1, 0) in 'device_assignment' is " - "outside of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{0, 0, -1, 0, 0, 0}, - "device coordinate (0, 0, -1) in 'device_assignment' " - "is outside of mesh shape (2, 1, 1)"), - std::make_tuple(2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{0, 0, 1, 0, 0, 0}, - "device coordinate (0, 0, 1) in 'device_assignment' is " - "outside of mesh shape (2, 1, 1)"), + "* 'num_cores_per_replica' * 4 (2 * 1 * 4), got 0"), std::make_tuple( - 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 1, 0, 0}), - std::vector{0, 0, 0, 0, 0, 0}, - "'device_assignment' has duplicate device coordinate (0, 0, 0)"))); + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{-1, 0, 0, 0, 0, 0, 0, 0}, + "device coordinate (-1, 0, 0, 0) in 'device_assignment' " + "is outside of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{2, 0, 0, 0, 0, 0, 0, 0}, + "device coordinate (2, 0, 0, 0) in 'device_assignment' is " + "outside of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{0, -1, 0, 0, 0, 0, 0, 0}, + "device coordinate (0, -1, 0, 0) in 'device_assignment' " + "is outside of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{0, 1, 0, 0, 0, 0, 0, 0}, + "device coordinate (0, 1, 0, 0) in 'device_assignment' is " + "outside of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{0, 0, 0, -1, 0, 0, 0, 0}, + "device coordinate (0, 0, 0, -1) in 'device_assignment' " + "is outside of mesh shape (2, 1, 1, 1)"), + std::make_tuple( + 2, 1, TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{0, 0, 0, 1, 0, 0, 0, 0}, + "device coordinate (0, 0, 0, 1) in 'device_assignment' is " + "outside of mesh shape (2, 1, 1, 1)"), + std::make_tuple(2, 1, + TopologyWithDeviceCoordinates({0, 0, 0, 0, 1, 0, 0, 0}), + std::vector{0, 0, 0, 0, 0, 0, 0, 0}, + "'device_assignment' has duplicate device coordinate " + "(0, 0, 0, 0)"))); std::vector MakeDeviceSet(int num_tasks, int num_devices_per_task) { @@ -270,15 +282,17 @@ TEST(TPURewriteDeviceUtilTest, 
topology_proto.add_mesh_shape(2); topology_proto.add_mesh_shape(1); topology_proto.add_mesh_shape(1); + topology_proto.add_mesh_shape(1); topology_proto.set_num_tasks(1); topology_proto.set_num_tpu_devices_per_task(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); } std::string topology_attr = topology_proto.SerializeAsString(); - std::vector device_assignment_attr{1, 0, 0}; + std::vector device_assignment_attr{1, 0, 0, 0}; llvm::SmallVector devices; std::vector device_names = @@ -292,7 +306,7 @@ TEST(TPURewriteDeviceUtilTest, ASSERT_FALSE(status_or.ok()); EXPECT_EQ(status_or.status().error_message(), "no TPU device found for 'device_assignment' device coordinate (1, " - "0, 0)"); + "0, 0, 0)"); } TEST(TPURewriteDeviceUtilTest, ValidFullMeshDeviceAssignment) { @@ -342,6 +356,7 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { { topology_proto.add_mesh_shape(2); topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(1); topology_proto.add_mesh_shape(2); topology_proto.set_num_tasks(2); topology_proto.set_num_tpu_devices_per_task(4); @@ -349,31 +364,40 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); - topology_proto.add_device_coordinates(1); - topology_proto.add_device_coordinates(0); - topology_proto.add_device_coordinates(1); - topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); } std::string topology_attr = topology_proto.SerializeAsString(); - std::vector device_assignment_attr{ - 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1}; + std::vector device_assignment_attr{0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, + 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, + 0, 1, 1, 1, 0, 0, 1, 1, 0, 1}; llvm::SmallVector devices; std::vector device_names = @@ -433,11 +457,12 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { EXPECT_EQ(computation_device_1.replica_device_ids(3), 7); } -TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x3) { +TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { tpu::TopologyProto topology_proto; { topology_proto.add_mesh_shape(1); 
topology_proto.add_mesh_shape(2); + topology_proto.add_mesh_shape(1); topology_proto.add_mesh_shape(3); topology_proto.set_num_tasks(3); topology_proto.set_num_tpu_devices_per_task(2); @@ -445,25 +470,31 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x3) { topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(0); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(2); topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(1); + topology_proto.add_device_coordinates(0); topology_proto.add_device_coordinates(2); } std::string topology_attr = topology_proto.SerializeAsString(); - std::vector device_assignment_attr{0, 0, 1, 0, 1, 1, 0, 0, 2, - 0, 1, 2, 0, 0, 0, 0, 1, 0}; + std::vector device_assignment_attr{ + 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0}; llvm::SmallVector devices; std::vector device_names = diff --git a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc index dff47861419..6aeead516e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/translate_utils.cc @@ -41,20 +41,22 @@ mlir::LogicalResult ExtractTfVersions(mlir::ModuleOp module, auto version_attr = module.getAttrOfType("tf.versions"); if (!version_attr) return mlir::failure(); - auto producer = version_attr.get("producer").dyn_cast(); + auto producer = + version_attr.get("producer").dyn_cast_or_null(); if (!producer) return mlir::failure(); versions->set_producer(producer.getInt()); auto min_consumer = - version_attr.get("min_consumer").dyn_cast(); - if (!min_consumer) return mlir::failure(); - versions->set_min_consumer(min_consumer.getInt()); + version_attr.get("min_consumer").dyn_cast_or_null(); + if (min_consumer) versions->set_min_consumer(min_consumer.getInt()); auto bad_consumers = - version_attr.get("bad_consumers").dyn_cast(); - if (!bad_consumers) return mlir::failure(); + version_attr.get("bad_consumers").dyn_cast_or_null(); + if (!bad_consumers) return mlir::success(); + for (auto bad_consumer : bad_consumers) { - auto bad_consumer_int_attr = bad_consumer.dyn_cast(); + auto bad_consumer_int_attr = + bad_consumer.dyn_cast_or_null(); if (!bad_consumer_int_attr) return mlir::failure(); versions->mutable_bad_consumers()->Add(bad_consumer_int_attr.getInt()); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc new file mode 100644 index 00000000000..bbe91054b3b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -0,0 +1,204 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace tensorflow { + +const char* const kXlaShardingAttrName = "_XlaSharding"; +const char* const kInputShardingAttr = "input_sharding_configuration"; +const char* const kOutputShardingAttr = "output_sharding_configuration"; + +llvm::Optional ParseShardingAttribute( + mlir::Operation* operation) { + const auto& sharding_attr = + operation->getAttrOfType(kXlaShardingAttrName); + if (!sharding_attr) return llvm::Optional(); + return sharding_attr.getValue(); +} + +llvm::SmallVector, 4> +ExtractInputsForLogicalDevices(int num_logical_cores, + mlir::tf_device::LaunchFuncOp launch_func) { + // Initialize the input list for each logical device. + llvm::SmallVector, 4> input_list; + input_list.reserve(num_logical_cores); + for (int i = 0; i < num_logical_cores; ++i) + input_list.emplace_back(llvm::SmallVector()); + + llvm::SmallVector launch_func_inputs( + launch_func.getOperands()); + auto sharding_attrs = + launch_func.getOperation()->getAttrOfType( + kInputShardingAttr); + // If the sharding attribute does not exist, then all inputs are placed on + // the 0th logical core by default. + if (!sharding_attrs) { + input_list[0] = launch_func_inputs; + return input_list; + }
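  // For illustration (hypothetical shardings, not taken from the original
  // code): with two logical cores and input sharding configuration
  // [REPLICATED, MAXIMAL on device 1], the loop below leaves core 0 with
  // {operand 0} and core 1 with {operand 0, operand 1}.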
+ + // Enumerate the sharding configuration for each input. If an input has + // replicated sharding then every logical device takes the value as an input. + // If an input has maximal sharding then only the specified logical device + // takes the value as an input. + for (const auto& sharding_attr_and_index : llvm::enumerate(sharding_attrs)) { + const auto& sharding_attr = sharding_attr_and_index.value(); + const auto input_index = sharding_attr_and_index.index(); + const auto& input_value = launch_func_inputs[input_index]; + + xla::OpSharding sharding; + sharding.ParseFromString( + sharding_attr.cast().getValue().str()); + + const auto input_sharing_type = sharding.type(); + if (input_sharing_type == xla::OpSharding::OTHER) + launch_func.emitError( + "tiled inputs are not yet supported for model parallelism"); + + if (input_sharing_type == xla::OpSharding::REPLICATED) { + for (auto& inputs : input_list) inputs.emplace_back(input_value); + } else { + assert(input_sharing_type == xla::OpSharding::MAXIMAL); + const int logical_device_id = sharding.tile_assignment_devices(0); + input_list[logical_device_id].emplace_back(input_value); + } + } + return input_list; +} + +mlir::LogicalResult ParseAndValidateOutputSharding( + mlir::tf_device::LaunchFuncOp launch_func, + mlir::SmallVector* output_sharding_list) { + output_sharding_list->reserve(launch_func.getNumResults()); + + const auto output_sharding_attrs = + launch_func.getOperation()->getAttrOfType( + kOutputShardingAttr); + if (!output_sharding_attrs) + return launch_func.emitError( + "output_sharding_configuration missing from launch func"); + + if (output_sharding_attrs.size() != launch_func.getNumResults()) + return launch_func.emitError("incorrect number of output sharding"); + + for (auto output_sharding_and_index : + llvm::enumerate(output_sharding_attrs)) { + const auto& output_sharding = output_sharding_and_index.value(); + const int sharding_index = output_sharding_and_index.index(); + if (!output_sharding.isa()) + return launch_func.emitError(llvm::formatv( + "non-string output sharding at index {0}", sharding_index)); + + xla::OpSharding sharding; + if (!sharding.ParseFromString( + output_sharding.cast().getValue().str())) + return launch_func.emitError("incorrect sharding format for outputs"); + + const auto output_sharing_type = sharding.type(); + if (output_sharing_type == xla::OpSharding::OTHER) + return launch_func.emitError( + "tiled outputs are not yet supported for model parallelism"); + + output_sharding_list->emplace_back(std::move(sharding)); + } + return mlir::success(); +} + +namespace { + +bool IsAssignedToLogicalDevice(const int core_id, + const xla::OpSharding& sharding) { + return sharding.type() == xla::OpSharding::MAXIMAL && + sharding.tile_assignment_devices(0) == core_id; +} + +// Returns the index of the return value in the region of +// `tf_device.parallel_execute` that represents the launch func output at +// index |launch_func_output_index|. Regions of parallel_execute may +// have different return values depending on the output sharding +// configuration.
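// For example (hypothetical shardings): with output shardings
// [MAXIMAL on device 0, REPLICATED, MAXIMAL on device 1] and core_id 1, the
// launch func output at index 2 maps to region output index 1, because among
// the outputs preceding it only the replicated output at index 1 is present
// in core 1's region.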
+int MapLaunchOutputIndexWithRegionOutputIndex( + llvm::ArrayRef output_sharding_config, const int core_id, + const int launch_func_output_index) { + int region_output_index = 0; + for (int output_index = 0; output_index < launch_func_output_index; + ++output_index) { + const auto& sharding = output_sharding_config[output_index]; + if (sharding.type() == xla::OpSharding::REPLICATED || + IsAssignedToLogicalDevice(core_id, sharding)) + region_output_index++; + } + + return region_output_index; +} + +} // namespace + +mlir::SmallVector GetOutputTypesForLogicalDeviceComputation( + const int logical_device_id, + llvm::ArrayRef output_sharding_config, + mlir::tf_device::LaunchFuncOp launch_func) { + mlir::SmallVector output_types; + output_types.reserve(launch_func.getNumResults()); + + for (auto result_and_index : llvm::enumerate(launch_func.getResults())) { + const auto output_index = result_and_index.index(); + const auto& output_sharding = output_sharding_config[output_index]; + const auto output_sharding_type = output_sharding.type(); + const auto& launch_func_output = result_and_index.value(); + + if (output_sharding_type == xla::OpSharding::REPLICATED || + IsAssignedToLogicalDevice(logical_device_id, output_sharding)) + output_types.emplace_back(launch_func_output.getType()); + } + + return output_types; +} + +void RemapOutputsFromLogicalDevices( + llvm::ArrayRef output_sharding_config, + mlir::tf_device::LaunchFuncOp launch_func, + mlir::tf_device::ParallelExecuteOp parallel_execute) { + for (auto result_and_index : llvm::enumerate(launch_func.getResults())) { + const auto output_index = result_and_index.index(); + const auto& launch_func_output = result_and_index.value(); + const auto& output_sharding = output_sharding_config[output_index]; + const auto output_sharing_type = output_sharding.type(); + + int logical_device_id = 0; + if (output_sharing_type == xla::OpSharding::MAXIMAL) + logical_device_id = output_sharding.tile_assignment_devices(0); + + // For maximal sharding configuration, correctly remap outputs from + // parallel_execute region to users of the launch func. + const int region_output_index = MapLaunchOutputIndexWithRegionOutputIndex( + output_sharding_config, logical_device_id, output_index); + + const auto output_from_logical_device = parallel_execute.GetRegionOutputs( + logical_device_id)[region_output_index]; + + launch_func_output.replaceAllUsesWith(output_from_logical_device); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h new file mode 100644 index 00000000000..4f548ca95aa --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h @@ -0,0 +1,68 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/Value.h" // TF:llvm-project +#include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace tensorflow { + +extern const char* const kXlaShardingAttrName; +extern const char* const kInputShardingAttr; +extern const char* const kOutputShardingAttr; + +// Parses the "_XlaSharding" attribute from the operation, if it exists. +llvm::Optional ParseShardingAttribute( + mlir::Operation* operation); + +// Parses the "input_sharding_configuration" attribute and returns a list where +// the i-th element is a list of mlir::Value's which represent inputs for the +// TPU computation corresponding to the i-th logical device. If the attribute +// does not exist, all inputs are placed on logical core 0. +llvm::SmallVector, 4> +ExtractInputsForLogicalDevices(int num_logical_cores, + mlir::tf_device::LaunchFuncOp launch_func); + +// Extracts a list of OpSharding that represents the output sharding +// configuration of `tf_device.launch`. +mlir::LogicalResult ParseAndValidateOutputSharding( + mlir::tf_device::LaunchFuncOp launch_func, + mlir::SmallVector* output_sharding_list); + +// Retrieves output types for the TPUExecute op representing execution for the +// provided logical device id. TPUExecute ops for different logical devices may +// have different outputs depending on the output sharding configuration. +mlir::SmallVector GetOutputTypesForLogicalDeviceComputation( + const int logical_device_id, + llvm::ArrayRef output_sharding_config, + mlir::tf_device::LaunchFuncOp launch_func); + +// Remaps outputs of the `tf_device.parallel_execute` op that represents the +// concurrent execution of `tf_device.launch_func` to the launch func's users.
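// Replicated outputs are taken from the region of logical core 0, while
// maximal outputs are taken from the region of their assigned core. For
// example (hypothetical shardings): with output shardings
// [REPLICATED, MAXIMAL on device 1], uses of result 0 are rewired to region
// 0's first output and uses of result 1 to region 1's second output (region 1
// also returns its own copy of the replicated result).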
+void RemapOutputsFromLogicalDevices( + llvm::ArrayRef output_sharding_config, + mlir::tf_device::LaunchFuncOp launch_func, + mlir::tf_device::ParallelExecuteOp parallel_execute); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index f5fc56556ec..29f9ec7eb46 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -49,15 +49,17 @@ static llvm::cl::opt splitInputFile( llvm::cl::init(false)); // NOLINTNEXTLINE -static llvm::cl::opt import_saved_model( - "savedmodel-to-mlir", - llvm::cl::desc("Import a saved model to its MLIR representation"), +static llvm::cl::opt import_saved_model_object_graph( + "savedmodel-objectgraph-to-mlir", + llvm::cl::desc( + "Import a saved model's object graph to its MLIR representation"), llvm::cl::value_desc("dir")); // NOLINTNEXTLINE -static llvm::cl::opt import_saved_model_v1( - "savedmodel-v1-to-mlir", - llvm::cl::desc("Import a saved model V1 to its MLIR representation"), +static llvm::cl::opt import_saved_model_signature_defs( + "savedmodel-signaturedefs-to-mlir", + llvm::cl::desc( + "Import a saved model's SignatureDefs to to their MLIR representation"), llvm::cl::value_desc("dir")); // NOLINTNEXTLINE @@ -83,11 +85,12 @@ int main(int argc, char** argv) { llvm::cl::ParseCommandLineOptions(argc, argv, "TF MLIR translation driver\n"); - if (!import_saved_model && !import_saved_model_v1 && !requested_translation) { + if (!import_saved_model_object_graph && !import_saved_model_signature_defs && + !requested_translation) { llvm::errs() << "error: need to specify one translation to perform\n"; return 1; - } else if (import_saved_model && import_saved_model_v1 && - requested_translation) { + } else if (import_saved_model_object_graph && + import_saved_model_signature_defs && requested_translation) { llvm::errs() << "error: cannot specify more than one translation to perform\n"; return 1; @@ -100,26 +103,26 @@ int main(int argc, char** argv) { return 1; } - if (import_saved_model) { + if (import_saved_model_object_graph) { std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); std::vector exported_names = absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); mlir::MLIRContext context; - auto module = tensorflow::SavedModelToMlirImport( + auto module = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, absl::Span(exported_names), &context); if (!module) return 1; module->print(output->os()); - } else if (import_saved_model_v1) { + } else if (import_saved_model_signature_defs) { std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); mlir::MLIRContext context; - auto module = - tensorflow::SavedModelV1ToMlirImport(input_filename, tags, &context); + auto module = tensorflow::SavedModelSignatureDefsToMlirImport( + input_filename, tags, &context); if (!module) return 1; module->print(output->os()); diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 5bf056af832..72126a7ef8f 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -15,11 +15,13 @@ package_group( "//learning/brain/experimental/swift_mlir/...", "//learning/brain/google/xla/kernels/...", "//learning/brain/swift/swift_mlir/...", + "//platforms/xla/...", "//tensorflow/compiler/mlir/...", "//tensorflow/compiler/tf2xla/...", 
"//tensorflow/compiler/xla/...", "//third_party/iree/...", "//third_party/mlir_edge/...", + "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) @@ -28,11 +30,27 @@ exports_files(["ir/hlo_ops.td"]) filegroup( name = "hlo_ops_td_files", srcs = [ + "ir/hlo_client_ops.td", "ir/hlo_ops.td", "ir/hlo_ops_base.td", "ir/hlo_utils.td", "ir/lhlo_ops.td", "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + ], +) + +gentbl( + name = "hlo_client_ops_inc_gen", + tbl_outs = [ + ("-gen-op-decls", "ir/hlo_client_ops.h.inc"), + ("-gen-op-defs", "ir/hlo_client_ops.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/hlo_client_ops.td", + td_srcs = [ + ":hlo_ops_td_files", ], ) @@ -47,7 +65,10 @@ gentbl( tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/hlo_ops.td", td_includes = ["ir/hlo_utils.td"], - td_srcs = [":hlo_ops_td_files"], + td_srcs = [ + ":hlo_ops_td_files", + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", + ], ) gentbl( @@ -130,28 +151,62 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_legalize_tf_with_tf2xla", + srcs = [ + "transforms/legalize_tf_with_tf2xla.cc", + ], + deps = [ + ":hlo", + ":mlir_hlo_builder", + "//tensorflow/compiler/jit:xla_cpu_device", + "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:translate_utils", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:session_options", + "//tensorflow/stream_executor:timer", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + cc_library( name = "map_xla_to_scalar_op", - srcs = [], hdrs = ["transforms/map_xla_to_scalar_op.h"], deps = [ ":hlo", ":lhlo", + ":map_hlo_to_lhlo_op", "@llvm-project//llvm:support", "@llvm-project//mlir:StandardOps", ], ) cc_library( - name = "hlo_shape_derivation", - srcs = [], - hdrs = ["transforms/hlo_shape_derivation.h"], + name = "map_hlo_to_lhlo_op", + hdrs = ["transforms/map_hlo_to_lhlo_op.h"], deps = [ ":hlo", ":lhlo", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", ], ) @@ -173,6 +228,23 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "lhlo_legalize_to_parallel_loops", + srcs = ["transforms/lhlo_legalize_to_parallel_loops.cc"], + deps = [ + ":lhlo", + "@com_google_absl//absl/memory", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LinalgOps", + "@llvm-project//mlir:LoopOps", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "xla_legalize_to_linalg", srcs = 
["transforms/xla_legalize_to_linalg.cc"], @@ -226,13 +298,26 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "lhlo_copy_removal", + srcs = ["transforms/lhlo_copy_removal.cc"], + deps = [ + ":lhlo", + "@com_google_absl//absl/memory", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + ], + alwayslink = 1, +) + cc_library( name = "hlo_legalize_to_lhlo", srcs = ["transforms/hlo_legalize_to_lhlo.cc"], deps = [ ":hlo", - ":hlo_shape_derivation", ":lhlo", + ":map_hlo_to_lhlo_op", "@com_google_absl//absl/memory", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -342,7 +427,9 @@ cc_library( ], deps = [ ":hlo", + "@llvm-project//llvm:support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], ) @@ -368,12 +455,14 @@ cc_library( cc_library( name = "hlo", srcs = [ + "ir/hlo_client_ops.cc", "ir/hlo_ops.cc", "ir/hlo_ops.cc.inc", "ir/hlo_ops.h.inc", "ir/hlo_utils.cc", ], hdrs = [ + "ir/hlo_client_ops.h", "ir/hlo_ops.h", "ir/hlo_utils.h", "transforms/passes.h", @@ -382,6 +471,7 @@ cc_library( includes = ["include"], deps = [ ":convert_op_folder", + ":hlo_client_ops_inc_gen", ":hlo_ops_base_inc_gen", ":hlo_ops_inc_gen", ":xla_canonicalize_inc_gen", @@ -389,7 +479,9 @@ cc_library( "@llvm-project//llvm:support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", @@ -398,6 +490,31 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "mlir_hlo_builder", + srcs = [ + "ir/mlir_hlo_builder.cc", + ], + hdrs = [ + "ir/mlir_hlo_builder.h", + ], + deps = [ + ":hlo", + ":hlo_utils", + ":type_to_shape", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:shape_inference", + "//tensorflow/core/platform:types", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/container:flat_hash_map", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "lhlo", srcs = [ @@ -418,6 +535,7 @@ cc_library( "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", @@ -598,6 +716,8 @@ tf_native_cc_binary( genrule( name = "operator_writer_inc", srcs = [ + "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", ":ir/hlo_ops.td", ":ir/hlo_ops_base.td", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index 2c20e113956..fa029bd50d0 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -237,11 +237,9 @@ StatusOr HloFunctionImporter::ImportInstruction( case HloOpcode::kBroadcast: { // Note that the HLO broadcast is more powerful than the XLA broadcast op. // BroadcastInDim offers a superset of the HLO op's functionality. 
- if (!instruction->dimensions().empty()) { - attributes.push_back(builder_->getNamedAttr( - "broadcast_dimensions", - ConvertDimensions(instruction->dimensions()))); - } + attributes.push_back( + builder_->getNamedAttr("broadcast_dimensions", + ConvertDimensions(instruction->dimensions()))); MakeAndReturn(BroadcastInDimOp); } #define MakeAndReturnBatchNormOp(batch_norm_op) \ diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index 8fa8b25255a..a0ce8a796cb 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -41,16 +41,23 @@ template type, llvm::makeArrayRef(data_span.data(), data_span.size())); } -llvm::SmallVector GetPermutationIfAvailable( +StatusOr> GetPermutationIfAvailable( const Shape& shape, mlir::Builder builder) { - if (!shape.has_layout() || shape.layout().minor_to_major().empty()) { - return {}; + if (!shape.has_layout() || + LayoutUtil::IsMonotonicWithDim0Major(shape.layout())) { + return llvm::SmallVector{}; } - llvm::SmallVector permutation; + if (!shape.is_static()) { + return tensorflow::errors::Internal( + "Permutations for dynamic shapes are not yet supported"); + } + llvm::SmallVector permuted_sizes; for (auto dim : llvm::reverse(shape.layout().minor_to_major())) { - permutation.push_back(dim); + permuted_sizes.push_back(shape.dimensions(dim)); } - return {AffineMap::getPermutationMap(permutation, builder.getContext())}; + return llvm::SmallVector{AffineMap::get( + permuted_sizes.size(), 0, + makeCanonicalStridedLayoutExpr(permuted_sizes, builder.getContext()))}; } } // namespace @@ -64,8 +71,10 @@ StatusOr ConvertTensorShapeToMemRefType( using mlir::MemRefType; auto dimensions = shape.dimensions(); llvm::SmallVector array(dimensions.begin(), dimensions.end()); + auto permutation_or = GetPermutationIfAvailable(shape, builder); + if (!permutation_or.ok()) return permutation_or.status(); return MemRefType::get(array, element_type_or.ValueOrDie(), - GetPermutationIfAvailable(shape, builder)); + permutation_or.ValueOrDie()); } StatusOr CreateDenseElementsAttrFromLiteral( diff --git a/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc b/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc index f5e5b0ad257..bafbc1ac9a9 100644 --- a/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/xla/ir/dialect_registration.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" // Static initialization for XLA dialect registration. static mlir::DialectRegistration xla_hlo_ops; +static mlir::DialectRegistration + xla_hlo_client_ops; static mlir::DialectRegistration xla_lhlo_ops; diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc new file mode 100644 index 00000000000..9056f532715 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc @@ -0,0 +1,127 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h" + +#include "mlir/IR/TypeUtilities.h" // TF:llvm-project + +namespace mlir { +namespace xla_hlo_client { + +template +static LogicalResult Verify(T op) { + return success(); +} + +//===----------------------------------------------------------------------===// +// BinaryOps +//===----------------------------------------------------------------------===// + +namespace { +// Gets the resulting type from a broadcast between two types. +static Type GetBroadcastType(Builder* builder, Type x, Type y, + Type element_type, + DenseIntElementsAttr broadcast_dimensions) { + auto x_ranked = x.dyn_cast(); + auto y_ranked = y.dyn_cast(); + if (!x_ranked || !y_ranked) { + return UnrankedTensorType::get(element_type); + } + + auto shape_x = x_ranked.getShape(); + auto shape_y = y_ranked.getShape(); + + if (shape_x.size() == shape_y.size()) { + llvm::SmallVector out_shape(shape_x.size()); + for (int i = 0; i < shape_x.size(); i++) { + auto x_val = shape_x[i]; + auto y_val = shape_y[i]; + if (x_val == -1 || y_val == -1) { + out_shape[i] = -1; + } else { + out_shape[i] = std::max(x_val, y_val); + } + } + return RankedTensorType::get(out_shape, element_type); + } + + // Return unranked tensor for invalid broadcast dimensions. + if (!broadcast_dimensions) return UnrankedTensorType::get(element_type); + + auto shape_large = shape_x.size() > shape_y.size() ? shape_x : shape_y; + auto shape_small = shape_x.size() <= shape_y.size() ? shape_x : shape_y; + + llvm::SmallVector out_shape(shape_large.begin(), + shape_large.end()); + + // Update according to the broadcast dimensions. 
+ for (auto index_pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { + auto old_value = out_shape[index_pair.value().getSExtValue()]; + auto new_value = shape_small[index_pair.index()]; + if (old_value != -1 && (new_value == -1 || new_value > old_value)) { + out_shape[index_pair.value().getSExtValue()] = new_value; + } + } + + return RankedTensorType::get(out_shape, element_type); +} +} // namespace + +#define BINARY_BUILDER(Op) \ + void Op::build(Builder* builder, OperationState& result, Value left, \ + Value right, DenseIntElementsAttr broadcast_dimensions) { \ + auto type = GetBroadcastType(builder, left.getType().cast(), \ + right.getType().cast(), \ + getElementTypeOrSelf(right.getType()), \ + broadcast_dimensions); \ + return Op::build(builder, result, type, left, right, \ + broadcast_dimensions); \ + } + +BINARY_BUILDER(AddOp); +BINARY_BUILDER(AndOp); +BINARY_BUILDER(Atan2Op); +BINARY_BUILDER(DivOp); +BINARY_BUILDER(MaxOp); +BINARY_BUILDER(MinOp); +BINARY_BUILDER(MulOp); +BINARY_BUILDER(OrOp); +BINARY_BUILDER(PowOp); +BINARY_BUILDER(RemOp); +BINARY_BUILDER(ShiftLeftOp); +BINARY_BUILDER(ShiftRightArithmeticOp); +BINARY_BUILDER(ShiftRightLogicalOp); +BINARY_BUILDER(SubOp); +BINARY_BUILDER(XorOp); + +#undef BINARY_BUILDER + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc.inc" + +//===----------------------------------------------------------------------===// +// xla_hlo_client Dialect Constructor +//===----------------------------------------------------------------------===// + +XlaHloClientDialect::XlaHloClientDialect(MLIRContext* context) + : Dialect(getDialectNamespace(), context) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.cc.inc" + >(); +} + +} // namespace xla_hlo_client +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h new file mode 100644 index 00000000000..541ab0ebe3f --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_CLIENT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_CLIENT_OPS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Dialect.h" // TF:llvm-project +#include "mlir/IR/DialectImplementation.h" // TF:llvm-project +#include "mlir/IR/MLIRContext.h" // TF:llvm-project +#include "mlir/IR/OpDefinition.h" // TF:llvm-project +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/IR/Types.h" // TF:llvm-project +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project + +namespace mlir { +namespace xla_hlo_client { + +class XlaHloClientDialect : public Dialect { + public: + explicit XlaHloClientDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "xla_hlo_client"; } +}; + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h.inc" + +} // namespace xla_hlo_client +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_HLO_CLIENT_OPS_H_ diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.td new file mode 100644 index 00000000000..2048604915d --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/hlo_client_ops.td @@ -0,0 +1,134 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Defines "client" aligned HLO ops. +// These ops are not necessarily orthogonal or optimized for transformation but +// for ease of expression in certain cases deemed important for client +// libraries (i.e. implicit broadcasting, helper ops, etc). +// This dialect is considered to exist in addition to augment the xla_hlo +// dialect for ergonomic needs, not duplicate/replace it. +// +// The typical use of this dialect is for client libraries to be able to emit +// less constrained ops and rely on the conversion framework to lower any +// xla_hlo_client ops to canonical xla_hlo ops. +// +// See: https://www.tensorflow.org/xla/operation_semantics + +#ifndef HLO_CLIENT_OPS +#define HLO_CLIENT_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffects.td" +include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" + +def HLOClient_Dialect : Dialect { + let name = "xla_hlo_client"; + let cppNamespace = "xla_hlo_client"; +} + +class HLOClient_Op traits> : + Op { + // TODO(b/129012527) Much of this custom verification should be expressed as + // type constraints. + let verifier = [{ return Verify(*this); }]; +} + +//===----------------------------------------------------------------------===// +// XLA binary elementwise op definitions. +// From the client perspective, each of these support both explicit rank +// broadcasting (via the broadcast_dimensions attribute) and implicit degenerate +// shape broadcasting. 
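As a rough usage sketch for the client-level binary ops described above (not part of this patch; the helper name EmitBroadcastedAdd, the rank-1/rank-2 operand shapes, and the choice of dimension 1 are assumptions made for the example), C++ code holding an OpBuilder could create an explicitly broadcast add like this:

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/StandardTypes.h"
#include "tensorflow/compiler/mlir/xla/ir/hlo_client_ops.h"

// Hypothetical helper: broadcast a rank-1 `lhs` along dimension 1 of a
// rank-2 `rhs` and add them. The custom builder declared for these ops
// infers the broadcasted result type from the operand types.
static mlir::Value EmitBroadcastedAdd(mlir::OpBuilder &builder,
                                      mlir::Location loc, mlir::Value lhs,
                                      mlir::Value rhs) {
  // broadcast_dimensions = dense<[1]> : tensor<1xi64>
  auto dims_type =
      mlir::RankedTensorType::get({1}, builder.getIntegerType(64));
  int64_t broadcast_dim = 1;
  auto broadcast_dims = mlir::DenseIntElementsAttr::get(
      dims_type, llvm::makeArrayRef(broadcast_dim));
  return builder
      .create<mlir::xla_hlo_client::AddOp>(loc, lhs, rhs, broadcast_dims)
      .getResult();
}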
+// +// These have 1:1 correspondance with same-named ops in the xla_hlo dialect; +// however, those operations do not support broadcasting. +// +// See: +// https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations +// https://www.tensorflow.org/xla/broadcasting +//===----------------------------------------------------------------------===// + +class HLOClient_BinaryElementwiseOp traits> : + HLOClient_Op { + let arguments = (ins + HLO_Tensor:$lhs, + HLO_Tensor:$rhs, + OptionalAttr:$broadcast_dimensions + ); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value left, Value right, " + "DenseIntElementsAttr broadcast_dimensions" + >]; + + let results = (outs HLO_Tensor); + let parser = [{ return mlir::impl::parseOneResultSameOperandTypeOp(parser, result); }]; + let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; +} + +def HLOClient_AddOp : HLOClient_BinaryElementwiseOp<"add", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_AddOp; + +def HLOClient_Atan2Op : HLOClient_BinaryElementwiseOp<"atan2", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_Atan2Op; + +def HLOClient_DivOp : HLOClient_BinaryElementwiseOp<"div", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_DivOp; + +def HLOClient_MaxOp : HLOClient_BinaryElementwiseOp<"max", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MaxOp; + +def HLOClient_MinOp : HLOClient_BinaryElementwiseOp<"min", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MinOp; + +def HLOClient_MulOp : HLOClient_BinaryElementwiseOp<"mul", + [Commutative, NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_MulOp; + +def HLOClient_PowOp : HLOClient_BinaryElementwiseOp<"pow", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_PowOp; + +def HLOClient_RemOp : HLOClient_BinaryElementwiseOp<"remainder", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_RemOp; + +def HLOClient_ShiftLeftOp : HLOClient_BinaryElementwiseOp<"shift_left", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftLeftOp; + +def HLOClient_ShiftRightArithmeticOp : HLOClient_BinaryElementwiseOp<"shift_right_arithmetic", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightArithmeticOp; + +def HLOClient_ShiftRightLogicalOp : HLOClient_BinaryElementwiseOp<"shift_right_logical", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ShiftRightLogicalOp; + +def HLOClient_SubOp : HLOClient_BinaryElementwiseOp<"sub", + [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_SubOp; + +//===----------------------------------------------------------------------===// +// XLA binary elementwise op definitions. +// The same description as the arithmetic binary elementwise ops applies. 
+//===----------------------------------------------------------------------===// + +class HLOClient_BinaryLogicalElementwiseOp : + HLOClient_BinaryElementwiseOp { + let arguments = (ins + HLO_PredOrIntTensor:$lhs, + HLO_PredOrIntTensor:$rhs, + OptionalAttr:$broadcast_dimensions + ); +} + +def HLOClient_AndOp: HLOClient_BinaryLogicalElementwiseOp<"and">, BASE_HLO_AndOp; +def HLOClient_OrOp: HLOClient_BinaryLogicalElementwiseOp<"or">, BASE_HLO_OrOp; +def HLOClient_XorOp : HLOClient_BinaryLogicalElementwiseOp<"xor">, BASE_HLO_XorOp; + +#endif // HLO_CLIENT_OPS diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index f44bb9da758..023ab46a66f 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project @@ -437,8 +438,8 @@ static LogicalResult Verify(BroadcastInDimOp op) { operandRank)); } - auto dimensions = *op.broadcast_dimensions(); - auto dimensionsType = op.broadcast_dimensions()->getType(); + auto dimensions = op.broadcast_dimensions(); + auto dimensionsType = op.broadcast_dimensions().getType(); auto dimensionsRank = dimensionsType.getRank(); if (dimensionsRank != 1) { return op.emitOpError(llvm::formatv( @@ -878,6 +879,12 @@ static LogicalResult Verify(RecvOp op) { return success(); } +//===----------------------------------------------------------------------===// +// CopyOp +//===----------------------------------------------------------------------===// + +OpFoldResult CopyOp::fold(ArrayRef operands) { return getOperand(); } + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// @@ -971,6 +978,47 @@ static LogicalResult Verify(SelectOp op) { return success(); } +// Makes it such that a SelectOp that is a non-root operation in a DRR infers +// the return type based on operand type. +LogicalResult SelectOp::inferReturnTypes( + MLIRContext*, Optional location, ValueRange operands, + ArrayRef attributes, RegionRange regions, + SmallVectorImpl& inferredReturnTypes) { + auto x_type = operands[1].getType(); + auto y_type = operands[2].getType(); + auto x_tensor = x_type.cast(); + auto y_tensor = y_type.cast(); + + // Check for type compatibility in the select op. This requires that the two + // non-predicate operands: + // (a) have the same element type + // (b) have compatible shapes (i.e. the same shape and/or at least one + // dynamic shape) + if (x_tensor.getElementType() != y_tensor.getElementType() || + failed(mlir::verifyCompatibleShape(x_type, y_type))) { + return emitOptionalError(location, "incompatible operand types: ", x_type, + " and ", y_type); + } + + // TODO(lucyfox): Support output shape inference when operands have compatible + // shapes. (The output shape should be the most general of the operand shapes + // at each dimension.) For now, handle the straightforward cases and fail + // otherwise. When this is fully implemented, this logic should move into + // reusable functionality in MLIR Core. 
+ Type output_type; + if (x_type == y_type || !x_tensor.hasRank()) { + output_type = x_type; + } else if (!y_tensor.hasRank()) { + output_type = y_type; + } else { + return emitOptionalError(location, + "currently unsupported operand types: ", x_type, + " and ", y_type); + } + inferredReturnTypes.assign({output_type}); + return success(); +} + //===----------------------------------------------------------------------===// // PadOp //===----------------------------------------------------------------------===// @@ -1488,5 +1536,40 @@ void XlaHloDialect::printType(Type type, DialectAsmPrinter& os) const { os << ""; } +//===----------------------------------------------------------------------===// +// Shape inference +//===----------------------------------------------------------------------===// + +LogicalResult deriveShapeFromFirstOperand( + OpBuilder* builder, Operation* op, + SmallVectorImpl* reifiedReturnShapes) { + Value operand = op->getOperand(0); + ShapedType operand_type = operand.getType().dyn_cast(); + if (!operand_type) { + op->emitOpError() << "first operand is not a shaped type"; + return failure(); + } + auto loc = op->getLoc(); + SmallVector shape_values; + shape_values.reserve(operand_type.getRank()); + auto shape_scalar_type = builder->getIntegerType(64); + for (auto element : llvm::enumerate(operand_type.getShape())) { + if (element.value() == ShapedType::kDynamicSize) { + Value dim = builder->create(loc, operand, element.index()); + shape_values.push_back( + builder->create(loc, dim, shape_scalar_type)); + } else { + shape_values.push_back(builder->create( + loc, builder->getI64IntegerAttr(element.value()))); + } + } + *reifiedReturnShapes = + SmallVector{builder->create( + loc, + RankedTensorType::get({operand_type.getRank()}, shape_scalar_type), + shape_values)}; + return success(); +} + } // namespace xla_hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h index d0bc9619db9..1a864507253 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -28,6 +28,8 @@ limitations under the License. #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // TF:llvm-project +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project #include "mlir/Support/Functional.h" // TF:llvm-project namespace mlir { @@ -72,6 +74,22 @@ class TokenType : public Type::TypeBase { static bool kindof(unsigned kind) { return kind == HLOTypes::Token; } }; +// Shape derivation function that computes the shape of the result based on +// the first argument. For a 2-dimensional input tensor, this produces IR of +// the form +// +// %0 = dim %arg0, 0 : memref +// %1 = index_cast %0 : index to i64 +// %2 = dim %arg0, 1 : memref +// %3 = index_cast %2 : index to i64 +// %4 = "xla_hlo.scalars_to_dimension_tensor"(%1, %3) +// : (i64, i64) -> tensor<2xi64> +// +// and returns %4 as the shape value. 
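For orientation, a minimal sketch of calling the helper declared just below (not part of this patch; the wrapper name ReifyShapeFromFirstOperand is invented, and it assumes `op` is an operation whose result shape follows its first operand):

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Builders.h"
#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"

// Hypothetical wrapper: materialize the shape of `op`'s first operand as a
// 1-D i64 tensor value inserted right before `op`, or return a null Value
// if the derivation fails.
static mlir::Value ReifyShapeFromFirstOperand(mlir::Operation *op) {
  mlir::OpBuilder builder(op);
  llvm::SmallVector<mlir::Value, 1> shapes;
  if (mlir::failed(
          mlir::xla_hlo::deriveShapeFromFirstOperand(&builder, op, &shapes)))
    return mlir::Value();
  return shapes.front();
}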
+LogicalResult deriveShapeFromFirstOperand( + OpBuilder *builder, Operation *op, + SmallVectorImpl *reifiedReturnShapes); + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h.inc" diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 8fe7bb9f58c..d85a44eca10 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -13,12 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This is the operation definition file for XLA. +// This is the operation definition file for XLA HLO ops which map to the +// traditional definition in xla_data.proto (or are aligned with the goals +// thereof). +// See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/xla_data.proto #ifndef HLO_OPS #define HLO_OPS include "mlir/IR/OpBase.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffects.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" include "tensorflow/compiler/mlir/xla/ir/hlo_utils.td" @@ -37,65 +42,12 @@ class HLO_Op traits> : let verifier = [{ return Verify(*this); }]; } -//===----------------------------------------------------------------------===// -// XLA type definitions. -//===----------------------------------------------------------------------===// - -// Token type. -def HLO_Token : Type()">, "token">; - -// Any integer tensor types -def HLO_IntTensor : TensorOf<[HLO_Int]>; - -// Any floating-point tensor types -def HLO_FpTensor : TensorOf<[AnyFloat]>; - -def HLO_PredTensor : TensorOf<[HLO_Pred]>; - -def HLO_Tensor : TensorOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; - -def HLO_ComplexTensor : TensorOf<[AnyComplex]>; - -def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; - -def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; - -// Dynamic representation of a shape vector as a tensor. Ideally this would be -// an index type (as it stores indices) but that is currently disallowed in -// MLIR. -def HLO_DimensionTensor : ShapedContainerType< - [AnySignlessInteger], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, - "a 1D tensor of dimensions">; - -// In general, static shaped tensor constraints should be avoided unless -// it is for a legacy op which is only correct with static shapes. -def HLO_StaticShapeTensor : StaticShapeTensorOf<[ - AnyFloat, AnySignlessInteger, AnyComplex]>; - -//===----------------------------------------------------------------------===// -// XLA combined type definitions. -//===----------------------------------------------------------------------===// - -// Any integer or floating-point tensor types -def HLO_IntOrFpTensor : TensorOf<[HLO_Int, AnyFloat]>; - -// Any integer or predicate tensor types -def HLO_PredOrIntTensor : TensorOf<[HLO_Pred, HLO_Int]>; - -// Any floating-point or complex tensor types -def HLO_FpOrComplexTensor : TensorOf<[AnyFloat, AnyComplex]>; - -// Any int, floating-point or complex tensor types -def HLO_IntFpOrComplexTensor : TensorOf<[HLO_Int, AnyFloat, AnyComplex]>; - -// Any pred, int or floating-point tensor types -def HLO_PredIntOrFpTensor : TensorOf<[HLO_Pred, HLO_Int, AnyFloat]>; - //===----------------------------------------------------------------------===// // XLA nullary op definitions. 
//===----------------------------------------------------------------------===// -def HLO_ConstOp : HLO_Op<"constant", [NoSideEffect]>, BASE_HLO_ConstOp { +def HLO_ConstOp : HLO_Op<"constant", [ConstantLike, NoSideEffect]>, + BASE_HLO_ConstOp { let arguments = (ins ElementsAttr:$value ); @@ -126,15 +78,41 @@ def HLO_IotaOp : HLO_Op<"iota", [NoSideEffect]>, BASE_HLO_IotaOp { let hasCustomHLOConverter = 1; } +def HLO_CreateTokenOp : HLO_Op<"create_token", [NoSideEffect]> { + string summary = "Create Token operator"; + + string description = [{ + Produces a HLO token. Tokens are used for ordering side-effecting perations. + This is exported to HLO as an AfterAll operation with no operands to + generate a token. + }]; + + let results = (outs HLO_Token:$output); +} + //===----------------------------------------------------------------------===// // XLA unary elementwise op definitions. //===----------------------------------------------------------------------===// // See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions class HLO_UnaryElementwiseOp traits, - Type TensorType>: HLO_Op { - + Type TensorType>: HLO_Op { let arguments = (ins TensorType:$operand); let results = (outs TensorType); + let extraClassDeclaration = [{ + static LogicalResult inferReturnTypeComponents( + MLIRContext* context, Optional location, + ValueRange operands, ArrayRef attributes, + RegionRange regions, + SmallVectorImpl& inferedReturnShapes) { + return failure(); + } + LogicalResult reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); + } + }]; } // Abs supports complex to real, so element type is not guaranteed to match. @@ -273,11 +251,11 @@ def HLO_RealOp: HLO_Op< // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations class HLO_BinaryElementwiseOp traits> : - HLO_Op { + HLO_Op { let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, - BroadcastDimAttr:$broadcast_dimensions + OptionalAttr:$broadcast_dimensions ); let builders = [OpBuilder< @@ -285,6 +263,20 @@ class HLO_BinaryElementwiseOp traits> : "DenseIntElementsAttr broadcast_dimensions" >]; + let extraClassDeclaration = [{ + static LogicalResult inferReturnTypeComponents( + MLIRContext* context, Optional location, ValueRange operands, + ArrayRef attributes, RegionRange regions, + SmallVectorImpl& inferedReturnShapes) { + return failure(); + } + LogicalResult reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); + } + }]; + let results = (outs HLO_Tensor); let parser = [{ return mlir::impl::parseOneResultSameOperandTypeOp(parser, result); }]; let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; @@ -336,7 +328,7 @@ class HLO_BinaryLogicalElementwiseOp : let arguments = (ins HLO_PredOrIntTensor:$lhs, HLO_PredOrIntTensor:$rhs, - BroadcastDimAttr:$broadcast_dimensions + OptionalAttr:$broadcast_dimensions ); } @@ -619,7 +611,7 @@ def HLO_CompareOp: HLO_Op<"compare", let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, - BroadcastDimAttr:$broadcast_dimensions, + OptionalAttr:$broadcast_dimensions, HLO_ComparisonDirectionAttr:$comparison_direction ); let builders = [OpBuilder< @@ -809,7 +801,7 @@ def HLO_DynamicBroadcastInDimOp : HLO_Op<"dynamic_broadcast_in_dim", let arguments = (ins HLO_Tensor:$operand, 
HLO_DimensionTensor:$output_dimensions, - BroadcastDimAttr:$broadcast_dimensions + OptionalAttr:$broadcast_dimensions ); let results = (outs HLO_Tensor); @@ -908,6 +900,7 @@ def HLO_ConvOp : HLO_Op<"conv", [NoSideEffect]>, BASE_HLO_ConvOp { def HLO_CopyOp: HLO_Op<"copy", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_CopyOp { let arguments = (ins HLO_Tensor); let results = (outs HLO_Tensor); + let hasFolder = 1; } def HLO_CrossReplicaSumOp : HLO_Op<"cross-replica-sum", @@ -942,10 +935,11 @@ def HLO_DotOp: HLO_Op<"dot", [NoSideEffect]>, BASE_HLO_DotOp { } def DotDimensionNumbers : StructAttr<"DotDimensionNumbers", HLO_Dialect, [ - StructFieldAttr<"lhs_batching_dimensions", ElementsAttr>, - StructFieldAttr<"rhs_batching_dimensions", ElementsAttr>, - StructFieldAttr<"lhs_contracting_dimensions", ElementsAttr>, - StructFieldAttr<"rhs_contracting_dimensions", ElementsAttr>] > { + StructFieldAttr<"lhs_batching_dimensions", I64ElementsAttr>, + StructFieldAttr<"rhs_batching_dimensions", I64ElementsAttr>, + StructFieldAttr<"lhs_contracting_dimensions", I64ElementsAttr>, + StructFieldAttr<"rhs_contracting_dimensions", I64ElementsAttr> + ]> { let description = "Structure of dimension information for dot product"; } @@ -1087,7 +1081,7 @@ def HLO_ScatterOp: HLO_Op<"scatter", [NoSideEffect]>, BASE_HLO_ScatterOp { } // TODO(jpienaar): Add broadcastable trait. -def HLO_SelectOp: HLO_Op<"select", [NoSideEffect]>, BASE_HLO_SelectOp { +def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods]>, BASE_HLO_SelectOp { let arguments = (ins HLO_PredTensor:$pred, HLO_Tensor:$on_true, diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index 64303e86fe0..8dee4d0eb69 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -18,14 +18,68 @@ limitations under the License. include "mlir/IR/OpBase.td" -def HLO_Int : IntOfWidths<[8, 16, 32, 64]>; +def HLO_Int : SignlessIntOfWidths<[8, 16, 32, 64]>; def HLO_Pred : TypeAlias; // The broadcasting dimensions correspond to a tuple that describes how a // smaller rank shape is broadcast into a larger rank shape. For example, // given a 2x3x4 cuboid and a 3x4 matrix, a broadcasting tuple (1,2) means // matching the matrix to dimensions 1 and 2 of the cuboid. -def BroadcastDimAttr : OptionalAttr; +defvar BroadcastDimAttr = I64ElementsAttr; + +//===----------------------------------------------------------------------===// +// XLA on tensors type definitions. +//===----------------------------------------------------------------------===// + +// Token type. +def HLO_Token : Type()">, "token">; + +// Any integer tensor types +def HLO_IntTensor : TensorOf<[HLO_Int]>; + +// Any floating-point tensor types +def HLO_FpTensor : TensorOf<[AnyFloat]>; + +def HLO_PredTensor : TensorOf<[HLO_Pred]>; + +def HLO_Tensor : TensorOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; + +def HLO_ComplexTensor : TensorOf<[AnyComplex]>; + +def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; + +def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; + +// Dynamic representation of a shape vector as a tensor. Ideally this would be +// an index type (as it stores indices) but that is currently disallowed in +// MLIR. 
+def HLO_DimensionTensor : ShapedContainerType< + [AnySignlessInteger], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, + "a 1D tensor of dimensions">; + +// In general, static shaped tensor constraints should be avoided unless +// it is for a legacy op which is only correct with static shapes. +def HLO_StaticShapeTensor : StaticShapeTensorOf<[ + AnyFloat, AnySignlessInteger, AnyComplex]>; + +//===----------------------------------------------------------------------===// +// XLA on tensors combined type definitions. +//===----------------------------------------------------------------------===// + +// Any integer or floating-point tensor types +def HLO_IntOrFpTensor : TensorOf<[HLO_Int, AnyFloat]>; + +// Any integer or predicate tensor types +def HLO_PredOrIntTensor : TensorOf<[HLO_Pred, HLO_Int]>; + +// Any floating-point or complex tensor types +def HLO_FpOrComplexTensor : TensorOf<[AnyFloat, AnyComplex]>; + +// Any int, floating-point or complex tensor types +def HLO_IntFpOrComplexTensor : TensorOf<[HLO_Int, AnyFloat, AnyComplex]>; + +// Any pred, int or floating-point tensor types +def HLO_PredIntOrFpTensor : TensorOf<[HLO_Pred, HLO_Int, AnyFloat]>; //===----------------------------------------------------------------------===// // XLA nullary op definitions. @@ -703,8 +757,8 @@ class BASE_HLO_AllToAllOp { AllToAll is a collective operation that sends data from all cores to all cores. It has two phases: - The scatter phase. On each core, the operand is split into `split_count` - number of blocks along the `split_dimensions`, and the blocks are - scattered to all cores, e.g., the i-th block is send to the i-th core. + number of blocks along the `split_dimension`, and the blocks are + scattered to all cores, e.g., the i-th block is sent to the i-th core. - The gather phase. Each core concatenates the received blocks along the `concat_dimension`. @@ -718,7 +772,7 @@ class BASE_HLO_AllToAllOp { will be concatenated in the same order of 1, 2, 3. Then, another AllToAll will be applied within replicas 4, 5, 0, and the concatenation order is also 4, 5, 0. If `replica_groups` is empty, all replicas belong to one - group, in the concatenation order of their appearance. + group, and the concatenation order is the numerical order (0, 1, 2, ...). Prerequisites: - The dimension size of the operand on the split_dimension is divisible by diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_utils.cc b/tensorflow/compiler/mlir/xla/ir/hlo_utils.cc index 130acaf1acb..0143e781549 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_utils.cc @@ -22,14 +22,16 @@ limitations under the License. namespace mlir { namespace xla { -DenseIntElementsAttr getBroadcastDimensionsAttr(Builder *b, Value x, Value y) { +DenseIntElementsAttr getBroadcastDimensionsAttr(Builder *b, Value x, Value y, + bool allow_empty) { TensorType xType = x.getType().dyn_cast(); TensorType yType = y.getType().dyn_cast(); - if (xType == yType || !xType || !yType) return {}; + if (!xType || !yType) return {}; + if (allow_empty && xType == yType) return {}; // If the shapes have the same rank, then there is nothing to do. auto xRank = xType.getRank(), yRank = yType.getRank(); - if (xRank == yRank) return {}; + if (allow_empty && xRank == yRank) return {}; // Otherwise if the ranks of the inputs don't match, TensorFlow automatically // reshapes the smaller by padding with dimensions of size 1 as a prefix. 
In diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_utils.h b/tensorflow/compiler/mlir/xla/ir/hlo_utils.h index 3e3570f5b54..84ea3a1e1a8 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_utils.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_utils.h @@ -28,9 +28,12 @@ namespace xla { // Computes the broadcast dimensions attr for an elementwise binary operator // between two ranked tensors. +// If `allow_empty` is true, then null can be returned to mean that the +// broadcast is an "identity". mlir::DenseIntElementsAttr getBroadcastDimensionsAttr(mlir::Builder* b, mlir::Value x, - mlir::Value y); + mlir::Value y, + bool allow_empty = true); /// Get a constant splat for the given value type. template diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_utils.td b/tensorflow/compiler/mlir/xla/ir/hlo_utils.td index 97b29bf0851..c6ea1fe9749 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_utils.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_utils.td @@ -32,6 +32,9 @@ def NullDenseIntElementsAttr : NativeCodeCall<"DenseIntElementsAttr()">; def BinBroadcastDimensions : NativeCodeCall< "xla::getBroadcastDimensionsAttr(&$_builder, $0, $1)">; +def BinBroadcastDimensionsNonEmpty : NativeCodeCall< + "xla::getBroadcastDimensionsAttr(&$_builder, $0, $1, /*allow_empty=*/false)">; + // Here, the element type can be any integer or float type. But, note that only // 32 bit integers are supported for the value. class GetScalarOfType : NativeCodeCall< diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h index 1a07b1a45f3..f9cb2284526 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -27,6 +27,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project +#include "mlir/Interfaces/SideEffects.h" // TF:llvm-project #include "mlir/Support/Functional.h" // TF:llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 3a675f20d92..a37c530532d 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -19,6 +19,7 @@ limitations under the License. #define LHLO_OPS include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffects.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def LHLO_Dialect : Dialect { @@ -97,6 +98,8 @@ def LHLO_NegOp: LHLO_UnaryElementwiseOp<"neg">, BASE_HLO_NegOp; def LHLO_RsqrtOp: LHLO_UnaryElementwiseOp<"rsqrt">, BASE_HLO_RsqrtOp; +def LHLO_SqrtOp: LHLO_UnaryElementwiseOp<"sqrt">, BASE_HLO_SqrtOp; + def LHLO_SignOp: LHLO_UnaryElementwiseOp<"sign">, BASE_HLO_SignOp; def LHLO_TanhOp: LHLO_UnaryElementwiseOp<"tanh">, BASE_HLO_TanhOp; @@ -111,7 +114,7 @@ class LHLO_BinaryElementwiseOp traits> : LHLO_Buffer:$lhs, LHLO_Buffer:$rhs, LHLO_Buffer:$out, - BroadcastDimAttr:$broadcast_dimensions + OptionalAttr:$broadcast_dimensions ); } @@ -152,6 +155,29 @@ def LHLO_ReduceOp: LHLO_Op<"reduce", [ let regions = (region SizedRegion<1>:$body); } + +def LHLO_ReduceWindowOp: LHLO_Op<"reduce_window", [ + NoSideEffect, + SingleBlockImplicitTerminator<"TerminatorOp"> + ]>, BASE_HLO_ReduceWindowOp { + + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$init_value, + LHLO_Buffer:$out, + I64ElementsAttr:$window_dimensions, + // If strides or dilations attributes are missing then the default value is + // one for each of the input dimensions. 
Similarly, padding values are zero + // for both low and high in each of the dimensions, if not specified. + OptionalAttr:$window_strides, + OptionalAttr:$base_dilations, + OptionalAttr:$window_dilations, + OptionalAttr:$padding + ); + + let regions = (region SizedRegion<1>:$body); +} + //===----------------------------------------------------------------------===// // XLA tuple op definitions. //===----------------------------------------------------------------------===// @@ -175,7 +201,7 @@ def LHLO_CompareOp: LHLO_Op<"compare", []>, BASE_HLO_CompareOp { LHLO_Buffer:$lhs, LHLO_Buffer:$rhs, LHLO_PredBuffer:$out, - BroadcastDimAttr:$broadcast_dimensions, + OptionalAttr:$broadcast_dimensions, HLO_ComparisonDirectionAttr:$comparison_direction ); } @@ -313,6 +339,21 @@ def LHLO_SelectOp: LHLO_Op<"select", []>, BASE_HLO_SelectOp { ); } +def LHLO_SelectAndScatterOp: LHLO_Op<"select_and_scatter", + [NoSideEffect]>, BASE_HLO_SelectAndScatterOp { + let arguments = (ins + LHLO_Buffer:$operand, + LHLO_Buffer:$source, + LHLO_Buffer:$init_value, + LHLO_Buffer:$out, + OptionalAttr:$window_dimensions, + OptionalAttr:$window_strides, + OptionalAttr:$padding + ); + + let regions = (region SizedRegion<1>:$select, SizedRegion<1>:$scatter); +} + def LHLO_ReverseOp: LHLO_Op<"reverse", []>, BASE_HLO_ReverseOp { let arguments = (ins LHLO_Buffer:$operand, diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc new file mode 100644 index 00000000000..1573810bc90 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -0,0 +1,139 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/xla/hlo_utils.h" +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/type_to_shape.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" + +namespace xla { + +static std::string GetMlirOpName(HloOpcode opcode) { + std::string op_name = HloOpcodeString(opcode); + absl::c_replace(op_name, '-', '_'); + return mlir::xla_hlo::XlaHloDialect::getDialectNamespace().str() + "." + + op_name; +} + +static std::string ToString(mlir::Type ty) { + std::string str; + llvm::raw_string_ostream sstream(str); + ty.print(sstream); + sstream.flush(); + return str; +} + +// Returns 1D 64-bit dense elements attribute with the given values. 
+static mlir::DenseIntElementsAttr GetI64ElementsAttr( + absl::Span values, mlir::Builder* builder) { + auto ty = mlir::RankedTensorType::get({static_cast(values.size())}, + builder->getIntegerType(64)); + llvm::SmallVector mlir_values; + mlir_values.reserve(values.size()); + for (const auto& value : values) { + mlir_values.push_back(value); + } + return mlir::DenseIntElementsAttr::get(ty, mlir_values); +} + +MlirHloBuilder::~MlirHloBuilder() = default; + +StatusOr MlirHloBuilder::MakeXlaOp(mlir::Value val) { + mlir::Type ty = val.getType(); + auto shape = std::make_unique(TypeToShape(ty)); + if (shape->element_type() == PrimitiveType::PRIMITIVE_TYPE_INVALID) { + return InvalidArgument("unsupported type: %s", ToString(ty).c_str()); + } + + int64 handle = reinterpret_cast(val.getAsOpaquePointer()); + handle_to_shape_[handle] = std::move(shape); + return XlaOp(handle, this); +} + +StatusOr MlirHloBuilder::ReshapeInternal(const Shape& shape, + XlaOp operand, + int64 inferred_dimension) { + TF_RETURN_IF_ERROR(first_error()); + + if (inferred_dimension != -1) + return Unimplemented("inferred_dimension not yet supported for Reshape op"); + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + mlir::Value value = GetValue(operand); + auto op = builder_.create(loc_, ty, value); + return MakeXlaOp(op.getResult()); +} + +StatusOr MlirHloBuilder::InDimBroadcast( + const Shape& shape, XlaOp operand, + absl::Span broadcast_dimensions) { + TF_RETURN_IF_ERROR(first_error()); + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + mlir::Value value = GetValue(operand); + auto op = builder_.create( + loc_, ty, value, GetI64ElementsAttr(broadcast_dimensions, &builder_)); + return MakeXlaOp(op.getResult()); +} + +XlaOp MlirHloBuilder::BinaryOpNoBroadcast( + HloOpcode binop, const Shape& shape, XlaOp lhs, XlaOp rhs, + absl::optional direction) { + return ReportErrorOrReturn([&]() -> StatusOr { + if (direction.has_value()) + return Unimplemented("direction attribute not yet supported"); + return CreateOp(GetMlirOpName(binop), shape, {lhs, rhs}, /*attributes=*/{}); + }); +} + +StatusOr MlirHloBuilder::AddOpWithShape( + HloOpcode opcode, const Shape& shape, absl::Span operands) { + return CreateOp(GetMlirOpName(opcode), shape, + llvm::makeArrayRef(operands.data(), operands.size()), + /*attributes=*/{}); +} + +StatusOr MlirHloBuilder::CreateOp( + const std::string& op_name, const Shape& shape, + llvm::ArrayRef operands, + llvm::ArrayRef attributes) { + llvm::SmallVector operand_values; + operand_values.reserve(operands.size()); + for (XlaOp xla_op : operands) { + operand_values.push_back(GetValue(xla_op)); + } + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + mlir::OperationState state(loc_, op_name, operand_values, {ty}, attributes); + mlir::Operation* op = builder_.createOperation(state); + return MakeXlaOp(op->getResult(0)); +} + +StatusOr MlirHloBuilder::GetShapePtr(XlaOp op) const { + TF_RETURN_IF_ERROR(first_error()); + TF_RETURN_IF_ERROR(CheckOpBuilder(op)); + auto it = handle_to_shape_.find(op.handle()); + if (it == handle_to_shape_.end()) { + return InvalidArgument("No XlaOp with handle %d", op.handle()); + } + return it->second.get(); +} + +} // namespace xla diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h new file mode 100644 index 00000000000..9bebbc025a5 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -0,0 +1,118 @@ +/* Copyright 
2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_IR_MLIR_HLO_BUILDER_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_IR_MLIR_HLO_BUILDER_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/Value.h" // TF:llvm-project +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { + +// Provides a way to construct xla_hlo dialect ops in MLIR using XlaBuilder +// interface. +// +// Requires that all XlaOp arguments are either returned by any of the builder +// method or constructed using MakeXlaOp method in this builder. +// +// TODO(hinsu): Support more ops and utility functions to set special attributes +// like OpMetadata and Sharding. +class MlirHloBuilder : public XlaBuilder { + public: + // Constructs builder for the given function. New operations are added to the + // beginning of the function, if it is non empty and has a block. + explicit MlirHloBuilder(mlir::FuncOp func) + : XlaBuilder(func.getName().str()), + builder_(&func.getBody()), + loc_(builder_.getUnknownLoc()) {} + + // TODO(hinsu): Add a constructor to build a new MLIR function from scratch + // and override Build methods. + + MlirHloBuilder(const MlirHloBuilder&) = delete; + MlirHloBuilder& operator=(const MlirHloBuilder&) = delete; + + ~MlirHloBuilder() override; + + // Wraps the given MLIR value under an XlaOp instance. Note that all HLO + // operations returns exactly one result therefore each op has an XlaOp + // wrapping result of the op. + // + // Returns an error if the HLO dialect doesn't support type of the given + // value. + StatusOr MakeXlaOp(mlir::Value val); + + // Returns value corresponding to the given op. + // + // Requires that the op was created by this builder. + mlir::Value GetValue(XlaOp op) { + void* ptr = reinterpret_cast(op.handle()); + return mlir::Value::getFromOpaquePointer(ptr); + } + + // Sets location for newly built ops, until reset. + void SetLocation(mlir::Location loc) { loc_ = loc; } + + // Update insertion point so that newly built ops are inserted before the + // given op in order, until reset. + void setInsertionPoint(mlir::Operation* op) { + builder_.setInsertionPoint(op); + } + + // Returns the shape of the given op. 
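A minimal usage sketch for this builder (not part of this patch): it assumes `func` already has two tensor block arguments with identical static shapes, and it elides error handling for brevity.

#include "mlir/IR/Function.h"
#include "mlir/IR/Value.h"
#include "tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"

// Hypothetical example: drive ordinary XlaBuilder client code through
// MlirHloBuilder so that it emits xla_hlo ops into `func`.
static mlir::Value EmitAddIntoFunc(mlir::FuncOp func) {
  xla::MlirHloBuilder builder(func);
  auto lhs = builder.MakeXlaOp(func.getArgument(0));
  auto rhs = builder.MakeXlaOp(func.getArgument(1));
  if (!lhs.ok() || !rhs.ok()) return mlir::Value();
  // xla::Add is the usual XlaBuilder client API; routed through this builder
  // it should materialize an xla_hlo.add op at the beginning of the function.
  xla::XlaOp sum = xla::Add(lhs.ValueOrDie(), rhs.ValueOrDie());
  return builder.GetValue(sum);
}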
+ StatusOr GetShapePtr(XlaOp op) const override; + + private: + StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, + int64 inferred_dimension) override; + + StatusOr InDimBroadcast( + const Shape& shape, XlaOp operand, + absl::Span broadcast_dimensions) override; + + XlaOp BinaryOpNoBroadcast( + HloOpcode binop, const Shape& shape, XlaOp lhs, XlaOp rhs, + absl::optional direction) override; + + StatusOr AddOpWithShape(HloOpcode opcode, const Shape& shape, + absl::Span operands) override; + + // Creates HLO dialect op and returns the result as an XlaOp. + StatusOr CreateOp(const std::string& op_name, const Shape& shape, + llvm::ArrayRef operands, + llvm::ArrayRef attributes); + + mlir::OpBuilder builder_; + mlir::Location loc_; + + absl::flat_hash_map> handle_to_shape_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_IR_MLIR_HLO_BUILDER_H_ diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index b73cfcfa538..18a29968600 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -74,3 +74,11 @@ func @extract_scalars_to_tensor(%arg0: i32, %arg1: i32) -> i32 { // CHECK: return %[[ARG0]] return %2 : i32 } + +// CHECK-LABEL: func @fold_copy +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] +func @fold_copy(%arg : tensor<1x4xf32>) -> tensor<1x4xf32> { + // CHECK: return [[ARG]] + %0 = "xla_hlo.copy"(%arg) : (tensor<1x4xf32>) -> tensor<1x4xf32> + return %0 : tensor<1x4xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index be6f0e6a949..2aeb5f1041d 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -hlo-legalize-to-lhlo -lhlo-redundant-copies-removal -split-input-file %s -o - | FileCheck %s -dump-input-on-failure +// RUN: tf-opt -hlo-legalize-to-lhlo %s -o - | FileCheck %s --dump-input-on-failure // CHECK-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -6,69 +6,48 @@ func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_result = "xla_hlo.exp"(%tensor_operand) {some_attr_1 = "exp.1", some_attr_2 = dense<1> : tensor<1xi64>} : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.exp"(%{{.*}}, %{{.*}}) {some_attr_1 = "exp.1", some_attr_2 = dense<1> : tensor<1xi64>} + // CHECK: "xla_lhlo.exp"(%{{.*}}, %{{.*}}) {some_attr_1 = "exp.1", some_attr_2 = dense<1> : tensor<1xi64>} tensor_store %tensor_result, %result : memref<2x2xf32> return } // ----- -// CHECK-LABEL: func @func_op -func @func_op(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) - %0 = xla_hlo.max %arg0, %arg1 {name = "maximum.47"} : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.max"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[RESULT]]) - return %0 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () -} - -// ----- - // CHECK-LABEL: func @func_op_long func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) + // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> // CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() {temp = true} : 
memref<4xf32> // CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> // CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - %1 = xla_hlo.max %arg0, %arg1 {name = "maximum.47"} : tensor<4xf32> + %1 = xla_hlo.max %arg0, %arg1 : tensor<4xf32> // CHECK-NEXT: "xla_lhlo.max"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) - %2 = xla_hlo.add %arg0, %1 {name = "maximum.47"} : tensor<4xf32> + %2 = xla_hlo.add %arg0, %1 : tensor<4xf32> // CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) - %3 = xla_hlo.min %arg0, %arg1 {name = "maximum.47"} : tensor<4xf32> + %3 = xla_hlo.min %arg0, %arg1 : tensor<4xf32> // CHECK-NEXT: "xla_lhlo.min"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) - %4 = xla_hlo.sub %arg1, %3 {name = "maximum.47"} : tensor<4xf32> + %4 = xla_hlo.sub %arg1, %3 : tensor<4xf32> // CHECK-NEXT: "xla_lhlo.sub"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) - %5 = xla_hlo.mul %2, %4 {name = "maximum.47"} : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.mul"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[RESULT]]) + %5 = xla_hlo.mul %2, %4 : tensor<4xf32> + // CHECK-NEXT: "xla_lhlo.mul"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) // CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> // CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> // CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> + // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () + // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> return %5 : tensor<4xf32> // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () } // ----- -// CHECK-LABEL: func @remove_lhlo_copy_op_created_from_tensor_store -func @remove_lhlo_copy_op_created_from_tensor_store(%arg0: tensor, %arg1: tensor, %arg2: memref) { - %0 = "xla_hlo.max"(%arg0, %arg1) : (tensor, tensor) -> tensor - tensor_store %0, %arg2 : memref - return -} -// CHECK: (%[[NEW_ARG0:.*]]: memref, %[[NEW_ARG1:.*]]: memref, %[[RESULT:.*]]: memref) -// CHECK-NOT: %[[ALLOC_OPERAND:.*]] = alloc() {temp = true} : memref -// CHECK: "xla_lhlo.max"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[RESULT]]) : (memref, memref, memref) -> () -// CHECK-NOT: "xla_lhlo.copy"(%[[ALLOC_OPERAND]], %[[RESULT]]) : (memref, memref) -> () -// CHECK-NOT: dealloc %[[ALLOC_OPERAND]] : memref -// CHECK: "xla_lhlo.terminator"() : () -> () - -// ----- - // CHECK-LABEL: func @fusion func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { + // CHECK: (%{{.*}}: {{.*}}, {{.*}}: {{.*}}, {{.*}}: {{.*}}, %[[RESULT:.*]]: {{.*}}) + // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> %tensor_summand_1 = tensor_load %summand_1 : memref<2x2xf32> %tensor_summand_2 = tensor_load %summand_2 : memref<2x2xf32> @@ -78,9 +57,11 @@ func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %tensor_multiplier = tensor_load %multiplier : memref<2x2xf32> %tensor_result = "xla_hlo.mul"(%sum, %tensor_multiplier) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.mul"(%[[ADD_RESULT]], %{{.*}}, %{{.*}}) + // CHECK-NEXT: "xla_lhlo.mul"(%[[ADD_RESULT]], %{{.*}}, %[[MUL_RESULT]]) + // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) tensor_store %tensor_result, %result : memref<2x2xf32> // CHECK-NEXT: dealloc 
%[[ADD_RESULT]] : memref<2x2xf32> + // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () "xla_lhlo.terminator"() : () -> () } @@ -92,7 +73,7 @@ func @copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.copy"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.copy"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.copy"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -104,7 +85,19 @@ func @exp(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.exp"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.exp"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.exp"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + +// CHECK-LABEL: func @log +func @log(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "xla_hlo.log"(%tensor_operand) + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: "xla_lhlo.log"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -119,7 +112,7 @@ func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>, %tensor_rhs = tensor_load %rhs : memref<2x2xf32> %tensor_result = "xla_hlo.select"(%tensor_pred, %tensor_lhs, %tensor_rhs) : (tensor<2x2xi1>, tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.select"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.select"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -133,7 +126,7 @@ func @compare(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2x %tensor_result = "xla_hlo.compare"(%tensor_lhs, %tensor_rhs) {comparison_direction = "EQ"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xi1> - // CHECK-NEXT: "xla_lhlo.compare"(%{{.*}}, %{{.*}}, %{{.*}}) {comparison_direction = "EQ"} + // CHECK: "xla_lhlo.compare"(%{{.*}}, %{{.*}}, %{{.*}}) {comparison_direction = "EQ"} tensor_store %tensor_result, %result : memref<2x2xi1> return } @@ -146,7 +139,7 @@ func @broadcast(%operand: memref<5xf32>, %result: memref<10x5xf32>) { %tensor_result = "xla_hlo.broadcast_in_dim"(%tensor_operand) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<10x5xf32> - // CHECK-NEXT: "xla_lhlo.broadcast_in_dim"(%{{.*}}, %{{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: "xla_lhlo.broadcast_in_dim"(%{{.*}}, %{{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} tensor_store %tensor_result, %result : memref<10x5xf32> return } @@ -183,7 +176,7 @@ func @dyn_broadcast(%operand: memref) { func @iota(%result: memref<10xi32>) { %tensor_result = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<10xi32> - // CHECK-NEXT: "xla_lhlo.iota"(%{{.*}}) {iota_dimension = 0 : i64} + // CHECK: "xla_lhlo.iota"(%{{.*}}) {iota_dimension = 0 : i64} tensor_store %tensor_result, %result : memref<10xi32> return } @@ -195,7 +188,7 @@ func @abs(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.abs"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.abs"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.abs"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, 
%result : memref<2x2xf32> return } @@ -207,7 +200,7 @@ func @ceil(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.ceil"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.ceil"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.ceil"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -231,7 +224,7 @@ func @cos(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.cos"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.cos"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.cos"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -243,7 +236,19 @@ func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.neg"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.neg"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.neg"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + +// CHECK-LABEL: func @rsqrt +func @rsqrt(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "xla_hlo.rsqrt"(%tensor_operand) + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: "xla_lhlo.rsqrt"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -255,7 +260,19 @@ func @sign(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.sign"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.sign"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.sign"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + +// CHECK-LABEL: func @sqrt +func @sqrt(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "xla_hlo.sqrt"(%tensor_operand) + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: "xla_lhlo.sqrt"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -267,7 +284,7 @@ func @tanh(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> %tensor_result = "xla_hlo.tanh"(%tensor_operand) : (tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.tanh"(%{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.tanh"(%{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } @@ -280,7 +297,7 @@ func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x %tensor_rhs = tensor_load %rhs : memref<2x2xf32> %tensor_result = "xla_hlo.remainder"(%tensor_lhs, %tensor_rhs) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: "xla_lhlo.remainder"(%{{.*}}, %{{.*}}, %{{.*}}) + // CHECK: "xla_lhlo.remainder"(%{{.*}}, %{{.*}}, %{{.*}}) tensor_store %tensor_result, %result : memref<2x2xf32> return } diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index 61add8c4389..1f4c9c6ea6c 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -77,6 +77,17 @@ func 
@integer_remainder(%lhs: tensor<2x2xi32>, // ----- +// CHECK-LABEL: func @float_rsqrt +func @float_rsqrt(%operand: tensor<2x2xf32>) -> tensor<2x2xf32> { + %tensor_result = "xla_hlo.rsqrt"(%operand) + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: linalg.generic + // CHECK: rsqrt + return %tensor_result : tensor<2x2xf32> +} + +// ----- + // CHECK-LABEL: func @float_sub func @float_sub(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { @@ -121,6 +132,16 @@ func @float_exp(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- +// CHECK-LABEL: func @float_log +func @float_log(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { + // CHECK: linalg.generic + // CHECK: log + %0 = "xla_hlo.log"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + // CHECK-LABEL: func @float_ceil func @float_ceil(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: linalg.generic @@ -192,13 +213,12 @@ func @int_cmp(%lhs: tensor<2x2xi32>, // ----- // CHECK-LABEL: func @copy +// CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] func @copy(%input: tensor<2x4x8xf32>) -> tensor<2x4x8xf32> { %0 = "xla_hlo.copy"(%input) : (tensor<2x4x8xf32>) -> (tensor<2x4x8xf32>) return %0 : tensor<2x4x8xf32> } -// CHECK: linalg.generic -// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): -// CHECK-NEXT: linalg.yield %[[OPERAND_IN]] : f32 +// CHECK: return [[ARG]] : tensor<2x4x8xf32> // ----- @@ -231,7 +251,7 @@ func @broadcast(%operand: tensor<5x7x1xf32>) -> tensor<7x10x6x4x5xf32> { // ----- -// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2) -> (0)> +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2) -> ()> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-LABEL: func @broadcast_scalar func @broadcast_scalar(%operand: tensor) -> tensor<7x10x6xf32> { diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir new file mode 100644 index 00000000000..f2dff2c9956 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -0,0 +1,91 @@ +// RUN: tf-opt -xla-legalize-tf-with-tf2xla=device-type=XLA_CPU %s | FileCheck %s --dump-input-on-failure + +// INVALID_DEVICE: tf-opt -xla-legalize-tf-with-tf2xla=device-type=INVALID_DEVICE %s | FileCheck %s --dump-input-on-failure + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + +// CHECK-LABEL: abs +// expected-error@+1 {{unsupported device}} +func @abs(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: %[[RESULT:.*]] = "xla_hlo.abs"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + %0 = "tf.Abs"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + + // return %[[RESULT]] + return %0 : tensor<2xf32> +} + +// CHECK-LABEL: unknown_op +func @unknown_op(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: tf.CustomTestOp + // expected-remark@+1 {{constant 20}} + %0 = "tf.CustomTestOp"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// CHECK-LABEL: dynamic_operand +func @dynamic_operand(%arg0: tensor) -> tensor { + // CHECK: tf.Abs + // expected-remark@+1 {{lowering requires static shaped operands}} + %0 = "tf.Abs"(%arg0) : (tensor) -> tensor + + return %0 : tensor +} + +// CHECK-LABEL: multiple_dialect_ops +func @multiple_dialect_ops(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: xla_hlo.neg + %0 = "xla_hlo.neg"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: xla_hlo.abs + %1 = "tf.Abs"(%0) : (tensor<2xf32>) -> tensor<2xf32> + + 
return %1 : tensor<2xf32> +} + +// CHECK-LABEL: binary_op +func @binary_op(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // CHECK: xla_hlo.atan2 %arg0, %arg1 : tensor<2xf32> + %0 = "tf.Atan2"(%arg0, %arg1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// CHECK-LABEL: binary_op_broadcast +func @binary_op_broadcast(%arg0: tensor<4x1xf32>, %arg1: tensor<4x1x4xf32>) -> tensor<4x4x4xf32> { + // CHECK: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4x1xf32>) -> tensor<4x4x1xf32> + // CHECK: %[[RESHAPE0:.*]] = "xla_hlo.reshape"(%[[BROADCAST0]]) : (tensor<4x4x1xf32>) -> tensor<4x4xf32> + // CHECK: %[[UPDATED_ARG0:.*]] = "xla_hlo.broadcast_in_dim"(%[[RESHAPE0]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4x4xf32> + + // CHECK: %[[RESHAPE1:.*]] = "xla_hlo.reshape"(%arg1) : (tensor<4x1x4xf32>) -> tensor<4x4xf32> + // CHECK: %[[UPDATED_ARG1:.*]] = "xla_hlo.broadcast_in_dim"(%[[RESHAPE1]]) {broadcast_dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<4x4xf32>) -> tensor<4x4x4xf32> + + // CHECK: %[[RESULT:.*]] = xla_hlo.atan2 %[[UPDATED_ARG0]], %[[UPDATED_ARG1]] : tensor<4x4x4xf32> + // CHECK: return %[[RESULT]] : tensor<4x4x4xf32> + + %0 = "tf.Atan2"(%arg0, %arg1) : (tensor<4x1xf32>, tensor<4x1x4xf32>) -> tensor<4x4x4xf32> + return %0: tensor<4x4x4xf32> +} + +// CHECK-LABEL: func @ternary_op +func @ternary_op(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: "xla_hlo.select"(%arg0, %arg1, %arg2) + %0 = "tf.SelectV2"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %0: tensor<2xi32> +} + +// CHECK-LABEL: func @convert +func @convert(%arg0: tensor<2xi32>) -> tensor<2xf32> { + // CHECK: "xla_hlo.convert"(%arg0) : (tensor<2xi32>) -> tensor<2xf32> + %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + +// TODO(hinsu): Add a test with variant type once one of the ops supporting +// the type is whitelisted. It should be rejected with unsupported type remark. + +// TODO(hinsu): Add a test with uint8 type once one of the ops supporting the +// type is whitelisted. Unsigned types are not yet added to the HLO dialect so +// it should return an error. See b/130356985 + +// TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is +// available but doesn't support this instance. 
+} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index d80722e2865..b759fe593c2 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -523,17 +523,17 @@ func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> ten } // CHECK-LABEL: func @shift_right_unsigned -func @shift_right_unsigned(%arg0: tensor<4x!tf.uint8>, %arg1: tensor<4x!tf.uint8>) -> tensor<4x!tf.uint8> { +func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { // CHECK: tf.RightShift - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4x!tf.uint8>, tensor<4x!tf.uint8>) -> tensor<4x!tf.uint8> - return %0 : tensor<4x!tf.uint8> + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<4xui8>) -> tensor<4xui8> + return %0 : tensor<4xui8> } // CHECK-LABEL: func @broadcast_shift_right_unsigned -func @broadcast_shift_right_unsigned(%arg0: tensor<4x!tf.uint8>, %arg1: tensor<2x4x!tf.uint8>) -> tensor<2x4x!tf.uint8> { +func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8>) -> tensor<2x4xui8> { // CHECK: tf.RightShift - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4x!tf.uint8>, tensor<2x4x!tf.uint8>) -> tensor<2x4x!tf.uint8> - return %0 : tensor<2x4x!tf.uint8> + %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xui8>, tensor<2x4xui8>) -> tensor<2x4xui8> + return %0 : tensor<2x4xui8> } // CHECK-LABEL: func @and @@ -1133,8 +1133,8 @@ func @preventgradient(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @infeed_dequeue_tuple func @infeed_dequeue_tuple() -> (tensor<3xi32>, tensor<4xf32>) { -// CHECK: [[AFTER_ALL:%.*]] = "xla_hlo.after_all"() : () -> !xla_hlo.token -// CHECK: [[INFEED:%.*]] = "xla_hlo.infeed"([[AFTER_ALL]]) {infeed_config = ""} : (!xla_hlo.token) -> tuple, tensor<4xf32>>, !xla_hlo.token> +// CHECK: [[TOKEN:%.*]] = "xla_hlo.create_token"() : () -> !xla_hlo.token +// CHECK: [[INFEED:%.*]] = "xla_hlo.infeed"([[TOKEN]]) {infeed_config = ""} : (!xla_hlo.token) -> tuple, tensor<4xf32>>, !xla_hlo.token> // CHECK: [[INFEED_VAL:%.*]] = "xla_hlo.get_tuple_element"([[INFEED]]) {index = 0 : i32} : (tuple, tensor<4xf32>>, !xla_hlo.token>) -> tuple, tensor<4xf32>> // CHECK: [[RES_1:%.*]] = "xla_hlo.get_tuple_element"([[INFEED_VAL]]) {index = 0 : i32} : (tuple, tensor<4xf32>>) -> tensor<3xi32> // CHECK: [[RES_2:%.*]] = "xla_hlo.get_tuple_element"([[INFEED_VAL]]) {index = 1 : i32} : (tuple, tensor<4xf32>>) -> tensor<4xf32> @@ -1228,6 +1228,80 @@ func @test_sparse_mat_mul(%arg0: tensor<3x4xf32>, %arg1: tensor<4x5xf32>) -> ten return %0: tensor<3x5xf32> } +//===----------------------------------------------------------------------===// +// MatrixBandPart op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: matrix_band_part +// CHECK-SAME: (%[[INPUT:.*]]: tensor<64x64xbf16>, %[[LOWER:.*]]: tensor, %[[UPPER:.*]]: tensor) +func @matrix_band_part(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { + // CHECK: %[[M:.*]] = xla_hlo.constant dense<64> : tensor + // CHECK: %[[N:.*]] = xla_hlo.constant dense<64> : tensor + + // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0> : tensor + // CHECK: %[[A:.*]] = "xla_hlo.compare"(%[[LOWER]], %[[ZERO]]) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + // CHECK: %[[B:.*]] = "xla_hlo.select"(%[[A]], %[[M]], %[[LOWER]]) : (tensor, tensor, tensor) -> tensor + + // CHECK: %[[C:.*]] = "xla_hlo.compare"(%[[UPPER]], %[[ZERO]]) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + // CHECK: %[[D:.*]] = "xla_hlo.select"(%[[C]], %[[N]], %[[UPPER]]) : (tensor, tensor, tensor) -> tensor + + // CHECK: %[[E:.*]] = "xla_hlo.convert"(%[[B]]) : (tensor) -> tensor + // CHECK: %[[F:.*]] = "xla_hlo.neg"(%[[E]]) : (tensor) -> tensor + + // CHECK: %[[X:.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<64x64xbf16> + // CHECK: %[[Y:.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<64x64xbf16> + // CHECK: %[[OFFSET:.*]] = xla_hlo.sub %[[X]], %[[Y]] : tensor<64x64xbf16> + // CHECK: %[[G:.*]] = "xla_hlo.compare"(%[[F]], %[[OFFSET]]) {comparison_direction = "LE"} : (tensor, tensor<64x64xbf16>) -> tensor<*xi1> + + // CHECK: %[[H:.*]] = "xla_hlo.convert"(%[[D]]) : (tensor) -> tensor + // CHECK: %[[I:.*]] = "xla_hlo.compare"(%[[OFFSET]], %[[H]]) {comparison_direction = "LE"} : (tensor<64x64xbf16>, tensor) -> tensor<*xi1> + + // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] : tensor<*xi1> + + // CHECK: %[[ZERO2:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<64x64xbf16> + // CHECK: %[[R:.*]] = "xla_hlo.select"(%[[J]], %[[INPUT]], %[[ZERO2]]) + // CHECK: return %[[R]] + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> + return %0 : tensor<64x64xbf16> +} + +// CHECK-LABEL: matrix_band_part_2 +// CHECK-SAME: (%[[INPUT:.*]]: tensor<12x24x48xbf16>, %[[LOWER:.*]]: tensor, %[[UPPER:.*]]: tensor) +func @matrix_band_part_2(%arg0: tensor<12x24x48xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<12x24x48xbf16> { + // CHECK: %[[X:.*]] = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<24x48xbf16> + // CHECK: %[[Y:.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<24x48xbf16> + // CHECK: %[[OFFSET:.*]] = xla_hlo.sub %[[X]], %[[Y]] : tensor<24x48xbf16> + + // CHECK: %[[G:.*]] = "xla_hlo.compare"(%[[F]], %[[OFFSET]]) {comparison_direction = "LE"} : (tensor, tensor<24x48xbf16>) -> tensor<*xi1> + + // CHECK: %[[H:.*]] = "xla_hlo.convert"(%[[D]]) : (tensor) -> tensor + // CHECK: %[[I:.*]] = "xla_hlo.compare"(%[[OFFSET]], %[[H]]) {comparison_direction = "LE"} : (tensor<24x48xbf16>, tensor) -> tensor<*xi1> + // CHECK: %[[J:.*]] = xla_hlo.and %[[G]], %[[I]] {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : tensor<*xi1> + + // CHECK: %[[ZERO2:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<12x24x48xbf16> + // CHECK: %[[R:.*]] = "xla_hlo.select"(%[[J]], %[[INPUT]], %[[ZERO2]]) + // CHECK: return %[[R]] + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<12x24x48xbf16>, tensor, tensor) -> tensor<12x24x48xbf16> + return %0 : tensor<12x24x48xbf16> +} + +// CHECK-LABEL: matrix_band_part_3 +// CHECK-SAME: (%[[INPUT:.*]]: tensor<*xbf16>, 
%[[LOWER:.*]]: tensor, %[[UPPER:.*]]: tensor) +func @matrix_band_part_3(%arg0: tensor<*xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { + // CHECK: "tf.MatrixBandPart" + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<*xbf16>, tensor, tensor) -> tensor<*xbf16> + return %0 : tensor<*xbf16> +} + +// CHECK-LABEL: matrix_band_part_4 +// CHECK-SAME: (%[[INPUT:.*]]: tensor<24x48xbf16>, %[[LOWER:.*]]: tensor, %[[UPPER:.*]]: tensor) +func @matrix_band_part_4(%arg0: tensor<24x48xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<24x48xbf16> { + // This one should lower. + // CHECK-NOT: "tf.MatrixBandPart" + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<24x48xbf16>, tensor, tensor) -> tensor<24x48xbf16> + return %0 : tensor<24x48xbf16> +} + //===----------------------------------------------------------------------===// // MaxPool op legalizations. //===----------------------------------------------------------------------===// @@ -1319,8 +1393,8 @@ func @one_hot(%indices: tensor<3xi32>, %on_value: tensor, %off_value: tenso // CHECK-SAME: [[VAL_0:%.*]]: tensor<3xi32>, [[VAL_1:%.*]]: tensor<4xf32>) func @outfeed_enqueue_tuple(%data_1: tensor<3xi32>, %data_2: tensor<4xf32>) -> () { // CHECK: [[TUPLE:%.*]] = "xla_hlo.tuple"([[VAL_0]], [[VAL_1]]) : (tensor<3xi32>, tensor<4xf32>) -> tuple, tensor<4xf32>> -// CHECK: [[AFTER_ALL:%.*]] = "xla_hlo.after_all"() : () -> !xla_hlo.token -// CHECK: "xla_hlo.outfeed"([[TUPLE]], [[AFTER_ALL]]) {outfeed_config = ""} : (tuple, tensor<4xf32>>, !xla_hlo.token) -> !xla_hlo.token +// CHECK: [[TOKEN:%.*]] = "xla_hlo.create_token"() : () -> !xla_hlo.token +// CHECK: "xla_hlo.outfeed"([[TUPLE]], [[TOKEN]]) {outfeed_config = ""} : (tuple, tensor<4xf32>>, !xla_hlo.token) -> !xla_hlo.token "tf.OutfeedEnqueueTuple"(%data_1, %data_2) : (tensor<3xi32>, tensor<4xf32>) -> () return } @@ -2415,6 +2489,25 @@ func @strided_slice_new_axis_mask(%input: tensor<2x4x8x16x32x64xf32>) { return } +// CHECK-LABEL: strided_slice_implicit_ellipsis_mask( +// CHECK-SAME: [[INPUT:%.*]]: tensor<10x16x2xf32> +func @strided_slice_implicit_ellipsis_mask(%input: tensor<10x16x2xf32>) -> tensor<2x16x2xf32> { + // StridedSlice gets input[8:10], which is same as input[8:10, ...] + // The start_indices, limit_indices, and strides attribute of xla_hlo.slice + // reflect the canonicalized slice. + %begin = "tf.Const"() {value = dense<8> : tensor<1xi32>} : () -> tensor<1xi32> + %end = "tf.Const"() {value = dense<10> : tensor<1xi32>} : () -> tensor<1xi32> + %strides = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: [[SLICE:%.*]] = "xla_hlo.slice"([[INPUT]]) + // CHECK-DAG-SAME: limit_indices = dense<[10, 16, 2]> : tensor<3xi64> + // CHECK-DAG-SAME: start_indices = dense<[8, 0, 0]> : tensor<3xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<3xi64> + // CHECK: [[RESHAPE:%.*]] = "xla_hlo.reshape"([[SLICE]]) : (tensor<2x16x2xf32>) -> tensor<2x16x2xf32> + %0 = "tf.StridedSlice"(%input, %begin, %end, %strides) {Index = i32, T = f32} : (tensor<10x16x2xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2x16x2xf32> + // CHECK: return [[RESHAPE]] : tensor<2x16x2xf32> + return %0 : tensor<2x16x2xf32> +} + //===----------------------------------------------------------------------===// // Reduction op legalizations. 
@@ -3247,6 +3340,121 @@ func @strided_slice_grad(%grad: tensor<4x16x1022xf32>) -> tensor<4x128x1024xf32>
   return %0: tensor<4x128x1024xf32>
 }
 
+// CHECK-LABEL: strided_slice_grad_shrink_axis_mask
+// CHECK-SAME: [[GRAD:%.*]]: tensor<8xf32>
+func @strided_slice_grad_shrink_axis_mask(%grad: tensor<8xf32>) -> tensor<4x8xf32> {
+  // Input to StridedSlice was of shape 4x8xf32
+  // Strided slice gets input[2:3, 0:8]
+  // shrink_axis_mask is 1, denoting that dim #0 is shrunk. So the output is
+  // 8xf32, which is the shape of the gradient.
+  // StridedSliceGrad would reshape the gradient to 1x8xf32 and
+  // then pad it to match the input shape 4x8xf32.
+
+  %shape = "tf.Const"() {value = dense<[4, 8]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %begin = "tf.Const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %end = "tf.Const"() {value = dense<[3, 8]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %strides = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> (tensor<2xi32>)
+
+  // CHECK: [[RESHAPE:%.*]] = "xla_hlo.reshape"([[GRAD]]) : (tensor<8xf32>) -> tensor<1x8xf32>
+  // CHECK: [[ZEROS:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: [[PAD:%.*]] = "xla_hlo.pad"([[RESHAPE]], [[ZEROS]])
+  // CHECK-DAG-SAME: edge_padding_low = dense<[2, 0]> : tensor<2xi64>
+  // CHECK-DAG-SAME: edge_padding_high = dense<[1, 0]> : tensor<2xi64>
+  // CHECK-DAG-SAME: interior_padding = dense<0> : tensor<2xi64>
+  %0 = "tf.StridedSliceGrad"(%shape, %begin, %end, %strides, %grad) {begin_mask = 0, end_mask = 0, shrink_axis_mask = 1} : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<8xf32>) -> tensor<4x8xf32>
+
+  // CHECK: return [[PAD]] : tensor<4x8xf32>
+  return %0 : tensor<4x8xf32>
+}
+
+// CHECK-LABEL: strided_slice_grad_new_axis_mask
+// CHECK-SAME: [[GRAD:%.*]]: tensor<1x2xf32>
+func @strided_slice_grad_new_axis_mask(%grad: tensor<1x2xf32>) -> tensor<8xf32> {
+  // Input to StridedSlice was of shape 8xf32
+  // Strided slice gets input[tf.new_axis, 2:4]
+  // new_axis_mask is 1, denoting that a new axis is inserted at dim #0. So the
+  // output is 1x2xf32, which is the shape of the gradient.
+  // StridedSliceGrad would reshape the gradient to 2xf32 and
+  // then pad it to match the input shape 8xf32.
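+  // In other words, the slice covers positions 2 and 3 of the 8-element
+  // input, so the reshaped 2xf32 gradient is expected to be padded below with
+  // edge_padding_low = 2 and edge_padding_high = 8 - 4 = 4, and no interior
+  // padding.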
+
+  %shape = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>)
+  %begin = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %end = "tf.Const"() {value = dense<[0, 4]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %strides = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> (tensor<2xi32>)
+
+  // CHECK: [[RESHAPE:%.*]] = "xla_hlo.reshape"([[GRAD]]) : (tensor<1x2xf32>) -> tensor<2xf32>
+  // CHECK: [[ZEROS:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: [[PAD:%.*]] = "xla_hlo.pad"([[RESHAPE]], [[ZEROS]])
+  // CHECK-DAG-SAME: edge_padding_low = dense<2> : tensor<1xi64>
+  // CHECK-DAG-SAME: edge_padding_high = dense<4> : tensor<1xi64>
+  // CHECK-DAG-SAME: interior_padding = dense<0> : tensor<1xi64>
+  %0 = "tf.StridedSliceGrad"(%shape, %begin, %end, %strides, %grad) {begin_mask = 0, end_mask = 0, new_axis_mask = 1} : (tensor<1xi32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<1x2xf32>) -> tensor<8xf32>
+
+  // CHECK: return [[PAD]] : tensor<8xf32>
+  return %0 : tensor<8xf32>
+}
+
+// CHECK-LABEL: strided_slice_grad_ellipsis_mask
+// CHECK-SAME: [[GRAD:%.*]]: tensor<2x4x8xf32>
+func @strided_slice_grad_ellipsis_mask(%grad: tensor<2x4x8xf32>) -> tensor<4x4x8xf32> {
+  // Input to StridedSlice was of shape 4x4x8xf32
+  // Strided slice gets input[2:4, ...]
+  // ellipsis_mask is 2, denoting that the slice contains all elements in
+  // dim #1 and dim #2, ignoring begin and end indices for these dimensions.
+  // So the output is 2x4x8xf32, which is the shape of the gradient.
+  // StridedSliceGrad would pad the gradient to match the shape of
+  // input 4x4x8xf32.
+
+  %shape = "tf.Const"() {value = dense<[4, 4, 8]> : tensor<3xi32>} : () -> (tensor<3xi32>)
+  %begin = "tf.Const"() {value = dense<[2, 3]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %end = "tf.Const"() {value = dense<[4, 5]> : tensor<2xi32>} : () -> (tensor<2xi32>)
+  %strides = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> (tensor<2xi32>)
+
+  // CHECK: [[RESHAPE:%.*]] = "xla_hlo.reshape"([[GRAD]]) : (tensor<2x4x8xf32>) -> tensor<2x4x8xf32>
+  // CHECK: [[ZEROS:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: [[PAD:%.*]] = "xla_hlo.pad"([[RESHAPE]], [[ZEROS]])
+  // CHECK-DAG-SAME: edge_padding_low = dense<[2, 0, 0]> : tensor<3xi64>
+  // CHECK-DAG-SAME: edge_padding_high = dense<0> : tensor<3xi64>
+  // CHECK-DAG-SAME: interior_padding = dense<0> : tensor<3xi64>
+  %0 = "tf.StridedSliceGrad"(%shape, %begin, %end, %strides, %grad) {begin_mask = 0, end_mask = 0, ellipsis_mask = 2} : (tensor<3xi32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>, tensor<2x4x8xf32>) -> tensor<4x4x8xf32>
+
+  // CHECK: return [[PAD]] : tensor<4x4x8xf32>
+  return %0 : tensor<4x4x8xf32>
+}
+
+
+// CHECK-LABEL: strided_slice_grad_all_masks
+// CHECK-SAME: [[GRAD:%.*]]: tensor<1x4x8x8x10x2x1xf32>
+func @strided_slice_grad_all_masks(%grad: tensor<1x4x8x8x10x2x1xf32>) -> tensor<2x4x8x16x32x64xf32> {
+  // For StridedSlice input[1, tf.new_axis, ..., 8:, :10, 2:6:2, tf.new_axis]
+  // the new axis mask is at indices 1 and 6 of the sparse spec, so
+  // new_axis_mask = 2^1 + 2^6 = 66
+  // The ellipsis mask is applied to dim #1 and dim #2 of the input, i.e., we
+  // get the canonicalized slice input[1, :, :, 8:, :10, 2:6:2]
+  // The StridedSliceGrad op would propagate the gradient for the sliced tensor
+  // to the original input tensor by padding with zeroes.
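+  // For reference, the remaining mask bits used below map onto the same
+  // 7-entry sparse spec: shrink_axis_mask = 1 (bit 0, the scalar index 1),
+  // ellipsis_mask = 4 (bit 2, the "..."), end_mask = 8 (bit 3, "8:" has no
+  // end index) and begin_mask = 16 (bit 4, ":10" has no begin index).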
+ + %shape = "tf.Const"() {value = dense<[2, 4, 8, 16, 32, 64]> : tensor<6xi32>} : () -> (tensor<6xi32>) + %begin = "tf.Const"() {value = dense<[1, 0, 0, 8, 1, 2, 0]> : tensor<7xi32>} : () -> (tensor<7xi32>) + %end = "tf.Const"() {value = dense<[2, 0, 0, 10, 10, 6, 0]> : tensor<7xi32>} : () -> (tensor<7xi32>) + %strides = "tf.Const"() {value = dense<[1, 1, 1, 1, 1, 2, 1]> : tensor<7xi32>} : () -> (tensor<7xi32>) + + // Remove 2 new axes (at index 1 and 6) and 1 shrink axis (at index 0) + // CHECK: [[RESHAPE:%.*]] = "xla_hlo.reshape"([[GRAD]]) : (tensor<1x4x8x8x10x2x1xf32>) -> tensor<1x4x8x8x10x2xf32> + // CHECK: [[ZERO:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor + // The edge_padding_low, edge_padding_high and interior_padding attributes of + // xla_hlo.pad would reflect the padding required to get the shape of the + // input of StridedSlice op. + // CHECK: [[PAD:%.*]] = "xla_hlo.pad"([[RESHAPE]], [[ZERO]]) + // CHECK-DAG-SAME: edge_padding_low = dense<[1, 0, 0, 8, 0, 2]> : tensor<6xi64> + // CHECK-DAG-SAME: edge_padding_high = dense<[0, 0, 0, 0, 22, 59]> : tensor<6xi64> + // CHECK-DAG-SAME: interior_padding = dense<[0, 0, 0, 0, 0, 1]> : tensor<6xi64> + %0 = "tf.StridedSliceGrad"(%shape, %begin, %end, %strides, %grad) {begin_mask = 16, end_mask = 8, shrink_axis_mask = 1, ellipsis_mask = 4, new_axis_mask = 66} : (tensor<6xi32>, tensor<7xi32>, tensor<7xi32>, tensor<7xi32>, tensor<1x4x8x8x10x2x1xf32>) -> tensor<2x4x8x16x32x64xf32> + + // CHECK: return [[PAD]] : tensor<2x4x8x16x32x64xf32> + return %0 : tensor<2x4x8x16x32x64xf32> +} + // CHECK-LABEL: @tensor_scatter_update func @tensor_scatter_update(%tensor: tensor, %indices: tensor, %updates: tensor) -> tensor { // CHECK: "xla_hlo.scatter"(%arg0, %arg1, %arg2) ( { @@ -3435,3 +3643,115 @@ func @xla_dynamic_update_slice(%arg0: tensor<4x16xf32>, %arg1: tensor<2x4xf32>, %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<4x16xf32>, tensor<2x4xf32>, tensor<2xi32>) -> tensor<4x16xf32> return %0 : tensor<4x16xf32> } + +// CHECK-LABEL: xla_dynamic_update_slice2 +func @xla_dynamic_update_slice2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, %arg2: tensor<1xi32>) -> tensor<4xf32> { + // CHECK: [[SLICE0:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK: [[RESHAPE0:%.+]] = "xla_hlo.reshape"([[SLICE0]]) : (tensor<1xi32>) -> tensor + // CHECK: [[DUS:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, %arg1, [[RESHAPE0]]) : (tensor<4xf32>, tensor<2xf32>, tensor) -> tensor<4xf32> + // CHECK: return [[DUS]] + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<4xf32>, tensor<2xf32>, tensor<1xi32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} + +//===----------------------------------------------------------------------===// +// Cumsum op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @cumsum_static +// CHECK-SAME: [[X:%.*]]: tensor<4xf32> +func @cumsum_static(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: [[AXIS:%.*]] = xla_hlo.constant dense<0> : tensor + // CHECK: [[CONVERT_X:%.*]] = "xla_hlo.convert"([[X]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: [[INIT:%.*]] = xla_hlo.constant dense<0.000000e+00> : tensor + // CHECK: [[REDUCE:%.*]] = "xla_hlo.reduce_window"([[CONVERT_X]], [[INIT]]) ( { + // CHECK: ^bb0([[A:%.*]]: tensor, [[B:%.*]]: tensor): + // CHECK: [[SUM:%.*]] = xla_hlo.add [[A]], [[B]] : tensor + // CHECK: "xla_hlo.return"([[SUM]]) : (tensor) -> () + // CHECK: }) {padding = dense<{{\[\[}}3, 0]]> : tensor<1x2xi64>, window_dimensions = dense<4> : tensor<1xi64>, window_strides = dense<1> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: [[CONVERT_REDUCE:%.*]] = "xla_hlo.convert"([[REDUCE]]) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: return [[CONVERT_REDUCE]] + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumsum"(%arg0, %0) {exclusive = false, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @cumsum_exclusive +func @cumsum_exclusive(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: "tf.Cumsum" + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumsum"(%arg0, %0) {exclusive = true, reverse = false} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @cumsum_reverse +func @cumsum_reverse(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // CHECK: "tf.Cumsum" + %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<0> : tensor} : () -> tensor + %1 = "tf.Cumsum"(%arg0, %0) {exclusive = false, reverse = true} : (tensor<4xf32>, tensor) -> tensor<4xf32> + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @cumsum_dynamic +func @cumsum_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "tf.Cumsum" + %0 = "tf.Cumsum"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +//===----------------------------------------------------------------------===// +// tf.BatchMatMulV2 op legalizations. 
+//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @batchmatmulv2_broadcast_singleton_dimension +func @batchmatmulv2_broadcast_singleton_dimension(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { + // CHECK: [[BLHS:%.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x4x2xf32>) -> tensor<3x4x2xf32> + // CHECK: [[BRHS:%.+]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>) -> tensor<3x2x4xf32> + // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { + // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, + // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, + // CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, + // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: }} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + // CHECK: return [[BDST]] : tensor<3x4x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + return %0 : tensor<3x4x4xf32> +} + +// CHECK-LABEL: func @batchmatmulv2_lhs_batch +func @batchmatmulv2_lhs_batch(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<3x4x4xf32> { + // CHECK: [[BLHS:%.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32> + // CHECK: [[BRHS:%.+]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<2x4xf32>) -> tensor<3x2x4xf32> + // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { + // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, + // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, + // CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, + // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: }} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + // CHECK: return [[BDST]] : tensor<3x4x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor<3x4x2xf32>, tensor<2x4xf32>) -> tensor<3x4x4xf32> + return %0 : tensor<3x4x4xf32> +} + +// CHECK-LABEL: func @batchmatmulv2_rhs_batch +func @batchmatmulv2_rhs_batch(%arg0: tensor<4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { + // CHECK: [[BLHS:%.+]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4x2xf32>) -> tensor<3x4x2xf32> + // CHECK: [[BRHS:%.+]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<3x2x4xf32>) -> tensor<3x2x4xf32> + // CHECK: [[BDST:%.+]] = "xla_hlo.dot_general"([[BLHS]], [[BRHS]]) {dot_dimension_numbers = { + // CHECK-SAME: lhs_batching_dimensions = dense<0> : tensor<1xi64>, + // CHECK-SAME: lhs_contracting_dimensions = dense<2> : tensor<1xi64>, + // CHECK-SAME: rhs_batching_dimensions = dense<0> : tensor<1xi64>, + // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: }} : (tensor<3x4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + // CHECK: return [[BDST]] : tensor<3x4x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : 
(tensor<4x2xf32>, tensor<3x2x4xf32>) -> tensor<3x4x4xf32> + return %0 : tensor<3x4x4xf32> +} + +// CHECK-LABEL: func @batchmatmulv2_dynamic +func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "tf.BatchMatMulV2" + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {T = f32, adj_x = false, adj_y = false, device = ""} : (tensor, tensor) -> tensor + return %0 : tensor +} + diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-copy-removal.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-copy-removal.mlir new file mode 100644 index 00000000000..35546594ccb --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-copy-removal.mlir @@ -0,0 +1,93 @@ +// RUN: tf-opt -lhlo-copy-removal %s -o - | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: func @remove_simple +func @remove_simple(%arg0: memref<2x2xf32>) { + %0 = alloc() {temp = true} : memref<2x2xf32> + "xla_lhlo.copy"(%0, %arg0) : (memref<2x2xf32>, memref<2x2xf32>) -> () + dealloc %0 : memref<2x2xf32> + // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + "xla_lhlo.terminator"() : () -> () +} + +// ----- + +// CHECK-LABEL: func @remove_without_dealloc +func @remove_without_dealloc(%arg0: memref<2x2xf32>) { + %0 = alloc() {temp = true} : memref<2x2xf32> + "xla_lhlo.copy"(%0, %arg0) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + "xla_lhlo.terminator"() : () -> () +} + +// ----- + +// CHECK-LABEL: func @replace_dependency +func @replace_dependency(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { + %0 = alloc() {temp = true} : memref<2x2xf32> + "xla_lhlo.exp"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: "xla_lhlo.exp"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.copy"(%0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () + dealloc %0 : memref<2x2xf32> + // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + "xla_lhlo.terminator"() : () -> () +} + +// ----- + +// CHECK-LABEL: func @keep_copies +func @keep_copies(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { + // CHECK-NEXT: "xla_lhlo.copy"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.copy"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + "xla_lhlo.terminator"() : () -> () +} + +// ----- + +// CHECK-LABEL: func @must_not_be_removed +func @must_not_be_removed(%arg0: memref<2x2xf32>, + %arg1: memref<2x2xf32>, + %arg2: memref<2x2xf32>) { + // CHECK-NEXT: %[[ALLOC:.*]] = alloc() {temp = true} : memref<2x2xf32> + %0 = alloc() {temp = true} : memref<2x2xf32> + // CHECK-NEXT: "xla_lhlo.exp"(%arg0, %[[ALLOC]]) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.exp"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: "xla_lhlo.exp"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.exp"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: "xla_lhlo.copy"(%[[ALLOC]], %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + dealloc %0 : memref<2x2xf32> + "xla_lhlo.terminator"() : () -> () +} + +// ----- + +// CHECK-LABEL: func @must_be_removed_first +func @must_be_removed_first(%arg0: memref<2x2xf32>, + %arg1: memref<2x2xf32>, + %arg2: memref<2x2xf32>) { + %0 = alloc() {temp = true} : memref<2x2xf32> + // CHECK-NEXT: "xla_lhlo.exp"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.exp"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: 
"xla_lhlo.exp"(%arg0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.exp"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + dealloc %0 : memref<2x2xf32> + "xla_lhlo.terminator"() : () -> () +} + +// ----- + +// CHECK-LABEL: func @must_be_removed_second +func @must_be_removed_second(%arg0: memref<2x2xf32>, + %arg1: memref<2x2xf32>, + %arg2: memref<2x2xf32>) { + %0 = alloc() {temp = true} : memref<2x2xf32> + // CHECK-NEXT: "xla_lhlo.exp"(%arg0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.exp"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + // CHECK-NEXT: "xla_lhlo.exp"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + "xla_lhlo.exp"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () + dealloc %0 : memref<2x2xf32> + "xla_lhlo.terminator"() : () -> () +} diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 7f7e37ebe66..0a48cbd372f 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -125,3 +125,55 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // PLOOP: subf // PLOOP: linalg.generic // PLOOP: exp + +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#pointwise_4d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} +func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32>, + %summand_2: memref<6x6x6x6xf32>, %result: memref<6x6x6x6xf32>) { + %temp_result = alloc() {temp = true} : memref<6x6x6x6xf32> + linalg.generic #pointwise_4d_trait %summand_1, %summand_2, %temp_result { + ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): + %out = addf %summand_1_in, %summand_2_in : f32 + linalg.yield %out : f32 + } : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>, memref<6x6x6x6xf32> + linalg.generic #pointwise_4d_trait %temp_result, %multiplier, %result { + ^bb0(%temp_result_in: f32, %multiplier_in: f32, %result_in: f32): + %out = mulf %temp_result_in, %multiplier_in : f32 + linalg.yield %out : f32 + } : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>, memref<6x6x6x6xf32> + dealloc %temp_result : memref<6x6x6x6xf32> + "xla_lhlo.terminator"() : () -> () +} +// CHECK-LABEL: func @fusion_4d +// CHECK: %[[C1:.*]] = constant 1 +// CHECK-NOT: linalg.generic +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK-NOT: loop.for +// CHECK: linalg.generic +// CHECK: addf +// CHECK: linalg.generic +// CHECK: mulf + +// TILED-LABEL: func @fusion_4d +// TILED-DAG: %[[C2:.*]] = constant 2 +// TILED-DAG: %[[C3:.*]] = constant 3 +// TILED-NOT: linalg.generic +// TILED: loop.for {{.*}} step %[[C2]] +// TILED: loop.for {{.*}} step %[[C3]] +// TILED-NOT: loop.for +// TILED: linalg.generic +// TILED: addf +// TILED: linalg.generic +// TILED: mulf + +// PLOOP-LABEL: func @fusion_4d +// PLOOP-NOT: linalg.generic +// PLOOP: loop.parallel +// PLOOP-NOT: loop.parallel +// PLOOP: linalg.generic +// PLOOP: addf +// PLOOP: linalg.generic +// PLOOP: mulf diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index d43ca3b6bb2..5d0c767a716 
100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -102,6 +102,20 @@ func @exp(%input: memref<2x2xf32>, // ----- +// CHECK-LABEL: func @log +func @log(%input: memref<2x2xf32>, + %result: memref<2x2xf32>) { + "xla_lhlo.log"(%input, %result) + : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[RESULT:.*]] = log %[[OPERAND_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + // CHECK-LABEL: func @copy func @copy(%input: memref<2x4x8xf32>, %result: memref<2x4x8xf32>) { @@ -210,6 +224,7 @@ func @broadcast(%operand: memref<5x7x1xf32>, %result: memref<7x10x6x4x5xf32>) { // ----- +// CHECK-DAG: #[[RESULT_MAP_0:.*]] = affine_map<(d0, d1, d2) -> ()> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-LABEL: func @broadcast_scalar func @broadcast_scalar(%operand: memref, %result: memref<7x10x6xf32>) { @@ -218,9 +233,8 @@ func @broadcast_scalar(%operand: memref, %result: memref<7x10x6xf32>) { : (memref, memref<7x10x6xf32>) -> () return } -// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[RESULT_MAP]]] -// CHECK-NEXT: ^bb0(%[[RESULT:.*]]: f32): -// CHECK-NEXT: %[[CONST:.*]] = load %{{.*}} : memref +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[RESULT_MAP_0]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[CONST:.*]]: f32, %[[RESULT:.*]]: f32): // CHECK-NEXT: linalg.yield %[[CONST]] : f32 // ----- @@ -401,6 +415,20 @@ func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, // ----- +// CHECK-LABEL: func @rsqrt +func @rsqrt(%input: memref<2x2xf32>, + %result: memref<2x2xf32>) { + "xla_lhlo.rsqrt"(%input, %result) + : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[RESULT:.*]] = rsqrt %[[OPERAND_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + // CHECK-LABEL: func @sign func @sign(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -416,6 +444,20 @@ func @sign(%input: memref<2x2xf32>, // ----- +// CHECK-LABEL: func @sqrt +func @sqrt(%input: memref<2x2xf32>, + %result: memref<2x2xf32>) { + "xla_lhlo.sqrt"(%input, %result) + : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[RESULT:.*]] = sqrt %[[OPERAND_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + // CHECK-LABEL: func @tanh func @tanh(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir new file mode 100644 index 00000000000..3317d24d820 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir @@ -0,0 +1,127 @@ +// RUN: tf-opt %s -lhlo-legalize-to-parallel-loops -canonicalize -split-input-file | FileCheck %s --dump-input-on-failure + +func @reduce(%arg: memref<100x10x5xf32>, + %init: memref, + %result: memref<100x5xf32>) { + "xla_lhlo.reduce"(%arg, %init, %result) ( { + ^bb0(%lhs: memref, %rhs: memref, %res: memref): + "xla_lhlo.add"(%lhs, %rhs, %res) + : (memref, memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + } ) {dimensions = dense<[1]> : tensor<1xi64>} + : (memref<100x10x5xf32>, memref, 
memref<100x5xf32>) -> () + return +} +// CHECK-LABEL: func @reduce( +// CHECK-SAME: [[ARG_BUF:%.*]]: memref<100x10x5xf32>, +// CHECK-SAME: [[INIT_BUF:%.*]]: memref, +// CHECK-SAME: [[RESULT_BUF:%.*]]: memref<100x5xf32>) { +// CHECK-DAG: [[C0:%.*]] = constant 0 : index +// CHECK-DAG: [[C1:%.*]] = constant 1 : index +// CHECK-DAG: [[C5:%.*]] = constant 5 : index +// CHECK-DAG: [[C10:%.*]] = constant 10 : index +// CHECK-DAG: [[C100:%.*]] = constant 100 : index +// CHECK: [[INIT:%.*]] = load [[INIT_BUF]] +// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK-SAME: to ([[C100]], [[C5]]) step ([[C1]], [[C1]]) { +// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK-SAME: ([[C0]]) to ([[C10]]) step ([[C1]]) init ([[INIT]]) -> f32 { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] +// CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref<100x10x5xf32> +// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): +// CHECK: [[ELEM_BUF:%.*]] = alloc() : memref +// CHECK: store [[ELEM]], [[ELEM_BUF]][] : memref +// CHECK: [[ACC_BUF:%.*]] = alloc() : memref +// CHECK: store [[ACC]], [[ACC_BUF]][] : memref +// CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_BUF]]) +// CHECK: [[ACC_RESULT:%.*]] = load [[ACC_BUF]][] : memref +// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: } +// CHECK: loop.yield +// CHECK: } +// CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] +// CHECK: loop.yield + +// ----- + +func @reduce_no_outer_loop(%arg: memref<100xf32>, + %init: memref, + %result: memref<1xf32>) { + "xla_lhlo.reduce"(%arg, %init, %result) ( { + ^bb0(%lhs: memref, %rhs: memref, %res: memref): + "xla_lhlo.add"(%lhs, %rhs, %res) + : (memref, memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + } ) {dimensions = dense<[0]> : tensor<1xi64>} + : (memref<100xf32>, memref, memref<1xf32>) -> () + return +} +// CHECK-LABEL: func @reduce_no_outer_loop( +// CHECK-SAME: [[ARG_BUF:%.*]]: memref<100xf32>, +// CHECK-SAME: [[ELEM_TO_REDUCE_BUF:%.*]]: memref, +// CHECK-SAME: [[RESULT_BUF:%.*]]: memref<1xf32>) { +// CHECK-DAG: [[C0:%.*]] = constant 0 : index +// CHECK-DAG: [[C1:%.*]] = constant 1 : index +// CHECK-DAG: [[C100:%.*]] = constant 100 : index +// CHECK: [[INIT:%.*]] = load [[INIT_BUF]] +// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[I:%.*]]) = ([[C0]]) +// CHECK-SAME: to ([[C100]]) step ([[C1]]) init ([[INIT]]) -> f32 { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]]{{\[}}[[I]]{{\]}} +// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): +// CHECK: [[ELEM_BUF:%.*]] = alloc() : memref +// CHECK: store [[ELEM]], [[ELEM_BUF]][] : memref +// CHECK: [[ACC_BUF:%.*]] = alloc() : memref +// CHECK: store [[ACC]], [[ACC_BUF]][] : memref +// CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_BUF]]) +// CHECK: [[ACC_RESULT:%.*]] = load [[ACC_BUF]][] : memref +// CHECK: loop.reduce.return [[ACC_RESULT]] +// CHECK: } +// CHECK: loop.yield +// CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[C0]]] + +// ----- + +func @dynamic_reduce(%arg: memref, + %init: memref, + %result: memref) { + "xla_lhlo.reduce"(%arg, %init, %result) ( { + ^bb0(%lhs: memref, %rhs: memref, %res: memref): + "xla_lhlo.add"(%lhs, %rhs, %res) + : (memref, memref, memref) -> () + "xla_lhlo.terminator"() : () -> () + } ) {dimensions = dense<[1]> : tensor<1xi64>} + : (memref, memref, memref) -> () + return +} +// CHECK-LABEL: func @dynamic_reduce( +// CHECK-SAME: 
[[ARG_BUF:%.*]]: memref, +// CHECK-SAME: [[INIT_BUF:%.*]]: memref, +// CHECK-SAME: [[RESULT_BUF:%.*]]: memref) { +// CHECK-DAG: [[C0:%.*]] = constant 0 : index +// CHECK-DAG: [[C1:%.*]] = constant 1 : index +// CHECK: [[DIM0:%.*]] = dim [[ARG_BUF]], 0 : memref +// CHECK: [[DIM1:%.*]] = dim [[ARG_BUF]], 1 : memref +// CHECK: [[DIM2:%.*]] = dim [[ARG_BUF]], 2 : memref +// CHECK: [[INIT:%.*]] = load [[INIT_BUF]] +// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK-SAME: to ([[DIM0]], [[DIM2]]) step ([[C1]], [[C1]]) { +// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK-SAME: ([[C0]]) to ([[DIM1]]) step ([[C1]]) init ([[INIT]]) -> f32 { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] +// CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref +// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): +// CHECK: [[ELEM_BUF:%.*]] = alloc() : memref +// CHECK: store [[ELEM]], [[ELEM_BUF]][] : memref +// CHECK: [[ACC_BUF:%.*]] = alloc() : memref +// CHECK: store [[ACC]], [[ACC_BUF]][] : memref +// CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_BUF]]) +// CHECK: [[ACC_RESULT:%.*]] = load [[ACC_BUF]][] : memref +// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: } +// CHECK: loop.yield +// CHECK: } +// CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] +// CHECK: loop.yield diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index 9f181d574c0..2953fc84d71 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -146,7 +146,7 @@ func @broadcast_in_dim_memref(%arg0: memref<1x2xi32>, %out: memref<1x2x2xi32>) - // CHECK-LABEL: func @broadcast_in_dim_zero_rank_memref func @broadcast_in_dim_zero_rank_memref(%arg0: memref, %out: memref<1x2x3xi32>) -> () { - "xla_lhlo.broadcast_in_dim"(%arg0, %out) : (memref, memref<1x2x3xi32>) -> () + "xla_lhlo.broadcast_in_dim"(%arg0, %out) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (memref, memref<1x2x3xi32>) -> () return } diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index ce70c7896c1..037eded9ba6 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -102,7 +102,7 @@ func @broadcast_in_dim(%arg0: tensor<1x2xi32>) -> tensor<1x2x2xi32> { // CHECK-LABEL: func @broadcast_in_dim_zero_rank func @broadcast_in_dim_zero_rank(%arg0: tensor) -> tensor<1x2x3xi32> { - %0 = "xla_hlo.broadcast_in_dim"(%arg0) : (tensor) -> tensor<1x2x3xi32> + %0 = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor) -> tensor<1x2x3xi32> return %0 : tensor<1x2x3xi32> } @@ -446,7 +446,7 @@ func @recv_non_token_second_result(%token: !xla_hlo.token) -> tuple>, %sigma: tensor) -> tensor<2x3x5xf32> { %shape = xla_hlo.constant dense<[2, 3, 5]> : tensor<3xi64> - // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) or 8/16/32/64-bit integer or floating-point values, but got 'tensor>'}} + // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) or 8/16/32/64-bit signless integer or floating-point values, but got 'tensor>'}} %0 = "xla_hlo.rng_uniform"(%mu, %sigma, %shape) : (tensor>, tensor, tensor<3xi64>) -> tensor<2x3x5xf32> return %0 : tensor<2x3x5xf32> } @@ -477,6 +477,31 @@ func @select_scalar_pred(%arg0: tensor, %arg1: tensor<2x3xi32>, %arg2: tenso // 
----- +// CHECK-LABEL: func @select_cast_compatible_types +func @select_cast_compatible_types(%arg0: tensor, %arg1: tensor<*xi32>, %arg2: tensor<2x3xi32>) -> tensor<*xi32> { + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor<*xi32>, tensor<2x3xi32>) -> tensor<*xi32> + return %0 : tensor<*xi32> +} + +// ----- + +func @select_cast_compatible_types(%arg0: tensor, %arg1: tensor<2x?xi32>, %arg2: tensor) -> tensor { + // TODO(lucyfox): Update once this is supported. + // expected-error@+1 {{currently unsupported operand types: 'tensor<2x?xi32>' and 'tensor'}} + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor<2x?xi32>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: func @select_scalar_x_y +func @select_scalar_x_y(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + func @select_bad_pred_type(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) values}} %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi32>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> @@ -485,18 +510,16 @@ func @select_bad_pred_type(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>, %arg2: // ----- -// TODO(jpienaar): Re-enable post updating select function verify. func @select_bad_shape_mismatch(%arg0: tensor<3xi1>, %arg1: tensor<2x4xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { - // should-be-error@+1 {{on_true type (tensor<2x4xi32>) does not match on_false type (tensor<2x3xi32>)}} + // expected-error@+1 {{incompatible operand types: 'tensor<2x4xi32>' and 'tensor<2x3xi32>'}} %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x4xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } // ----- -// TODO(jpienaar): Re-enable post updating select function verify. 
func @select_bad_element_type_mismatch(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { - // should-be-error@+1 {{on_true type (tensor<2x3xf32>) does not match on_false type (tensor<2x3xi32>)}} + // expected-error@+1 {{incompatible operand types: 'tensor<2x3xf32>' and 'tensor<2x3xi32>'}} %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -731,7 +754,7 @@ func @or_i1_type(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> { // ----- func @or_invalid_f32_type(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) or 8/16/32/64-bit integer values, but got 'tensor<4xf32>'}} + // expected-error@+1 {{must be tensor of pred (AKA boolean or 1-bit integer) or 8/16/32/64-bit signless integer values, but got 'tensor<4xf32>'}} %0 = "xla_hlo.or"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index aac4e613358..8af27bb586a 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -160,6 +160,16 @@ func @main(%arg0: tensor<1xf32>) -> tensor<1x10xf32> { // ----- +// CHECK: HloModule +func @main() -> !xla_hlo.token { + %0 = "xla_hlo.create_token"() : () -> !xla_hlo.token + return %0 : !xla_hlo.token +} + +// CHECK: ROOT [[TOKEN:%.*]] = token[] after-all() + +// ----- + // CHECK: HloModule func @main(%arg0: tensor<4xi32>) -> tensor<4xi32> { %0 = call @callee(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt index 38e58be8e64..01a24c06d2c 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt @@ -27,7 +27,7 @@ ENTRY %tfcompile.48 { // CHECK-NEXT: %cst = constant {name = "constant.8"} dense<1.000000e+00> : tensor %constant.8 = f32[] constant(1) - // CHECK-NEXT: %5 = "xla_hlo.broadcast_in_dim"(%cst) {name = "broadcast.9"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %5 = "xla_hlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<[]> : tensor<0xi64>, name = "broadcast.9"} : (tensor) -> tensor<300x1x5xf32> %broadcast.9 = f32[300,1,5] broadcast(%constant.8), dimensions={} // CHECK-NEXT: %6 = xla_hlo.mul %4, %5 {name = "multiply.31"} : tensor<300x1x5xf32> @@ -36,7 +36,7 @@ ENTRY %tfcompile.48 { // CHECK-NEXT: %cst_0 = constant {name = "constant.32"} dense<0.000000e+00> : tensor %constant.32 = f32[] constant(0) - // CHECK-NEXT: %7 = "xla_hlo.broadcast_in_dim"(%cst_0) {name = "broadcast.33"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %7 = "xla_hlo.broadcast_in_dim"(%cst_0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, name = "broadcast.33"} : (tensor) -> tensor<300x1x5xf32> %broadcast.33 = f32[300,1,5] broadcast(%constant.32), dimensions={} // CHECK-NEXT: %8 = "xla_hlo.compare"(%6, %7) {comparison_direction = "GT", name = "compare.34"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xi1> @@ -45,13 +45,13 @@ ENTRY %tfcompile.48 { // CHECK-NEXT: %cst_1 = constant {name = "constant.10"} 
dense<0.000000e+00> : tensor %constant.10 = f32[] constant(0) - // CHECK-NEXT: %9 = "xla_hlo.broadcast_in_dim"(%cst_1) {name = "broadcast.11"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %9 = "xla_hlo.broadcast_in_dim"(%cst_1) {broadcast_dimensions = dense<[]> : tensor<0xi64>, name = "broadcast.11"} : (tensor) -> tensor<300x1x5xf32> %broadcast.11 = f32[300,1,5] broadcast(%constant.10), dimensions={} // CHECK-NEXT: %cst_2 = constant {name = "constant.40"} dense<0.000000e+00> : tensor %constant.40 = f32[] constant(0) - // CHECK-NEXT: %10 = "xla_hlo.broadcast_in_dim"(%cst_2) {name = "broadcast.41"} : (tensor) -> tensor<300x5xf32> + // CHECK-NEXT: %10 = "xla_hlo.broadcast_in_dim"(%cst_2) {broadcast_dimensions = dense<[]> : tensor<0xi64>, name = "broadcast.41"} : (tensor) -> tensor<300x5xf32> %broadcast.41 = f32[300,5] broadcast(%constant.40), dimensions={} // CHECK-NEXT: %11 = "xla_hlo.copy"(%arg1) {name = "copy.1"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> diff --git a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir index 1270e339d98..b5e1eaf104a 100644 --- a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir @@ -11,7 +11,7 @@ func @batchNormInference_2D_inner_features( %mean: tensor<256xf32>, %variance: tensor<256xf32>) -> (tensor<4x256xf32>) { // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.001000e-05> : tensor - // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%[[EPS]]) : (tensor) -> tensor<256xf32> + // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%[[EPS]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor) -> tensor<256xf32> // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor<256xf32> // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor<256xf32>) -> tensor<256xf32> // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.broadcast_in_dim"(%[[STDDEV]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<4x256xf32> @@ -92,3 +92,46 @@ func @batchNormInference_f16_overflow( tensor<256xf16>) -> tensor<4x256xf16> return %0 : tensor<4x256xf16> } + +// ----- +// CHECK-LABEL: @batchNormInference_dynamic_shape +// Validate that dynamic shapes are handled properly. 
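// For reference when reading the CHECK lines that follow (a summary, not part
// of the generated test): the expanded form computes, element-wise,
//   result = (x - mean) * scale / sqrt(variance + epsilon) + offset
// with each 1-D operand broadcast along feature_index = 1 and
// epsilon = 1.000000e-03 coming from the op's attribute.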
+// CHECK-SAME: %[[X:[^:[:space:]]+]] +// CHECK-SAME: %[[SCALE:[^:[:space:]]+]] +// CHECK-SAME: %[[OFFSET:[^:[:space:]]+]] +// CHECK-SAME: %[[MEAN:[^:[:space:]]+]] +// CHECK-SAME: %[[VARIANCE:[^:[:space:]]+]] +func @batchNormInference_dynamic_shape( + %x: tensor, %scale: tensor, %offset: tensor, + %mean: tensor, %variance: tensor) + -> tensor { + // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.000000e-03> : tensor + // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], 0 : tensor + // CHECK-DAG: %[[INDEX_CAST:.+]] = index_cast %[[DIM]] : index to i32 + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INDEX_CAST]]) : (i32) -> tensor<1xi32> + // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor + // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor + // CHECK-DAG: %[[INPUT_DIM_0:.+]] = dim %[[X]], 0 : tensor + // CHECK-DAG: %[[INPUT_INDEX_CAST_0:.+]] = index_cast %[[INPUT_DIM_0]] : index to i32 + // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], 1 : tensor + // CHECK-DAG: %[[INPUT_INDEX_CAST_1:.+]] = index_cast %[[INPUT_DIM_1]] : index to i32 + // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], 2 : tensor + // CHECK-DAG: %[[INPUT_INDEX_CAST_2:.+]] = index_cast %[[INPUT_DIM_2]] : index to i32 + // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], 3 : tensor + // CHECK-DAG: %[[INPUT_INDEX_CAST_3:.+]] = index_cast %[[INPUT_DIM_3]] : index to i32 + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_INDEX_CAST_0]], %[[INPUT_INDEX_CAST_1]], %[[INPUT_INDEX_CAST_2]], %[[INPUT_INDEX_CAST_3]]) : (i32, i32, i32, i32) -> tensor<4xi32> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[X_CENTER:.+]] = xla_hlo.sub %[[X]], %[[MEAN_BCAST]] : tensor + // CHECK-DAG: %[[X_SCALED:.+]] = xla_hlo.mul %[[X_CENTER]], %[[SCALE_BCAST]] : tensor + // CHECK-DAG: %[[X_NORMED:.+]] = xla_hlo.div %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor + // CHECK-DAG: %[[RESULT:.+]] = xla_hlo.add %[[X_NORMED]], %[[OFFSET_BCAST]] : tensor + %0 = "xla_hlo.batch_norm_inference"(%x, %scale, %offset, %mean, %variance) + {epsilon = 0.001 : f32, feature_index = 1 : i64} : + (tensor, tensor, tensor, tensor, + tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 29d399c68fa..cc6ca472c23 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -30,7 +30,7 @@ limitations 
under the License. #include "mlir/Transforms/DialectConversion.h" // TF:llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" -#include "tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h" +#include "tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" @@ -117,7 +117,7 @@ Value InsertAllocAndDealloc(Location loc, Value result, return alloc; } -template +template class HloToLhloOpConverter : public ConversionPattern { public: explicit HloToLhloOpConverter(MLIRContext* context) @@ -138,23 +138,24 @@ class HloToLhloOpConverter : public ConversionPattern { buffer_args.push_back( InsertAllocAndDealloc(op->getLoc(), result.value(), &rewriter)); } else { - Value shape_value = ShapeDerivation::impl::deriveShapeFromOp( - op, result.index(), &rewriter); - if (!shape_value) { + SmallVector results_shape; + auto shape_type_op = dyn_cast(op); + if (!shape_type_op) return matchFailure(); + if (failed( + shape_type_op.reifyReturnTypeShapes(rewriter, results_shape))) return matchFailure(); - } buffer_args.push_back(InsertDynamicAllocAndDealloc( - op->getLoc(), result.value(), shape_value, &rewriter)); + op->getLoc(), result.value(), results_shape.front(), &rewriter)); } } - rewriter.create(op->getLoc(), llvm::None, buffer_args, - op->getAttrs()); + rewriter.create>(op->getLoc(), llvm::None, + buffer_args, op->getAttrs()); rewriter.replaceOp(op, ArrayRef(buffer_args).slice(operands.size())); return matchSuccess(); } }; -struct HloToLHloDynamicBroadcastInDimOpConverter +struct HloToLhloDynamicBroadcastInDimOpConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -178,7 +179,7 @@ struct HloToLHloDynamicBroadcastInDimOpConverter } }; -struct HloToLHloReduceOpConverter +struct HloToLhloReduceOpConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -272,14 +273,14 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // "xla_lhlo.fusion"() ({ // %0 = tensor_load %arg1 : memref<2x2xf32> // %1 = tensor_load %arg2 : memref<2x2xf32> -// %2 = "xla_hlo.add"(%0, %1) {name = "add"} : +// %2 = "xla_hlo.add"(%0, %1) : // (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // %3 = tensor_load %arg0 : memref<2x2xf32> -// %4 = "xla_hlo.mul"(%2, %3) {name = "multiply"} : +// %4 = "xla_hlo.mul"(%2, %3) : // (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // tensor_store %4, %arg3 : memref<2x2xf32> // "xla_lhlo.terminator"() : () -> () -// }) {name = "fusion"} : () -> () +// }) : () -> () // return // } // @@ -289,14 +290,14 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // %arg2: memref<2x2xf32>, // %arg3: memref<2x2xf32>) { // "xla_lhlo.fusion"() ( { -// %0 = alloc() {temp = true} : memref<2x2xf32> +// %0 = alloc() : memref<2x2xf32> // "xla_lhlo.add"(%arg1, %arg2, %0) : // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () // "xla_lhlo.mul"(%0, %arg0, %arg3) : // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () // dealloc %0 : memref<2x2xf32> // "xla_lhlo.terminator"() : () -> () -// }) {name = "fusion"} : () -> () +// }) : () -> () // return // } // } @@ -304,9 +305,9 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // FuncOp signature conversion example: // // func @func_op(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { -// %0 = 
xla_hlo.max %arg0, %arg1 {name = "maximum.47"} : tensor<4xf32> -// %1 = xla_hlo.add %arg0, %0 {name = "maximum.47"} : tensor<4xf32> -// return %1 : tensor<4xf32> +// %0 = "xla_hlo.max"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> +// tensor<4xf32> %1 = "xla_hlo.add"(%arg0, %0) : (tensor<4xf32>, +// tensor<4xf32>) -> tensor<4xf32> return %1 : tensor<4xf32> // } // // Transformed function with an extra argument for the result. The types have @@ -315,11 +316,14 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // func @func_op(%arg0: memref<4xf32>, // %arg1: memref<4xf32>, // %arg2: memref<4xf32>) { -// %0 = alloc() {temp = true} : memref<4xf32> -// "xla_lhlo.max"(%arg0, %arg1, %0) {name = "maximum.47"} : +// %0 = alloc() : memref<4xf32> +// %1 = alloc() : memref<4xf32> +// "xla_lhlo.max"(%arg0, %arg1, %0) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () -// "xla_lhlo.add"(%arg0, %0, %arg2) {name = "maximum.47"} : +// "xla_lhlo.add"(%arg0, %0, %1) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () +// "xla_lhlo.copy"(%1, %arg2) : (memref<4xf32>, memref<4xf32>) -> () +// dealloc %0 : memref<4xf32> // dealloc %1 : memref<4xf32> // "xla_lhlo.terminator"() : () -> () // } @@ -438,90 +442,47 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { // clang-format off patterns->insert< - HloToLHloDynamicBroadcastInDimOpConverter, + HloToLhloDynamicBroadcastInDimOpConverter, HloToLhloFuncOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLhloOpConverter, - HloToLHloReduceOpConverter, - StdToLhloReturnOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloReduceOpConverter, HloToLhloTensorLoadOpConverter, - HloToLhloTensorStoreOpConverter + HloToLhloTensorStoreOpConverter, + StdToLhloReturnOpConverter >(context); // clang-format on } -/// Removes Lhlo.CopyOp that copies from an allocated buffer to the block -/// argument. All uses of the buffer are replaced with the block argument. 
-struct RedundantCopiesRemoval : mlir::FunctionPass { - void runOnFunction() override { - llvm::SmallVector eraseList; - getFunction().walk([&](mlir::xla_lhlo::CopyOp copyOp) { - auto arguments = copyOp.getOperation()->getBlock()->getArguments(); - if (std::any_of(arguments.begin(), arguments.end(), - [&](mlir::BlockArgument arg) { - return copyOp.output() == arg; - }) && - std::none_of(arguments.begin(), arguments.end(), - [&](mlir::BlockArgument arg) { - return copyOp.operand() == arg; - })) { - mlir::Value operand = copyOp.operand(); - mlir::Value output = copyOp.output(); - copyOp.erase(); - for (auto op : operand.getUsers()) { - if (!mlir::isa(op)) { - op->replaceUsesOfWith(operand, output); - } - } - auto allocOp = operand.getDefiningOp(); - if (auto deallocOp = - mlir::dyn_cast(*allocOp->getUsers().begin())) { - eraseList.push_back(deallocOp); - eraseList.push_back(allocOp); - } - } - }); - for (auto op : eraseList) { - op->erase(); - } - }; -}; - std::unique_ptr> createLegalizeToLhloPass() { return absl::make_unique(); } -std::unique_ptr> createLhloCopyRemovalPass() { - return absl::make_unique(); -} - static PassRegistration legalize_pass( "hlo-legalize-to-lhlo", "Legalize from HLO dialect to LHLO dialect"); -static PassRegistration copies_removal_pass( - "lhlo-redundant-copies-removal", - "Legalize from HLO dialect to LHLO dialect"); - } // namespace xla_hlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h b/tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h deleted file mode 100644 index d2a1f47e540..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_shape_derivation.h +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_HLO_SHAPE_DERIVATION_H_ -#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_HLO_SHAPE_DERIVATION_H_ - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project -#include "mlir/IR/Attributes.h" // TF:llvm-project -#include "mlir/IR/Builders.h" // TF:llvm-project -#include "mlir/IR/Location.h" // TF:llvm-project -#include "mlir/IR/MLIRContext.h" // TF:llvm-project -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Transforms/DialectConversion.h" // TF:llvm-project -#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" - -namespace mlir { -namespace xla_hlo { - -// This file contains implementations for shape derivation functions that, -// given some operation and a result number, produce IR that computes the -// shape of the given result at runtime based on operands of the provided -// operation. -// These should be generated at some point based on annotations on the HLO -// using the new shape dialect. While this is still in the works, we hardcode -// the expected IR here to unblock progress. 
-// The implementation is based on templates to allow for using these derivation -// functions in templated code. - -namespace impl { - -struct UnknownShape { - // Default shape derivation function that simply fails with a runtime error. - static Value deriveShapeFromOp(Operation* op, int operand_position, - ConversionPatternRewriter* rewriter) { - op->emitOpError() - << "dynamic result shapes cannot be derived for this operation"; - return {}; - } -}; - -struct SameShapeAsFirstOperand { - // Shape derivation function that computes the shape of the result based on - // the first argument. For a 2-dimensional input tensor, this produces IR of - // the form - // - // %0 = dim %arg0, 0 : memref - // %1 = index_cast %0 : index to i64 - // %2 = dim %arg0, 1 : memref - // %3 = index_cast %2 : index to i64 - // %4 = "xla_hlo.scalars_to_dimension_tensor"(%1, %3) - // : (i64, i64) -> tensor<2xi64> - // - // and returns %4 as the shape value. - static Value deriveShapeFromOp(Operation* op, int result_postion, - ConversionPatternRewriter* rewriter) { - Value operand = op->getOperand(0); - ShapedType operand_type = operand.getType().dyn_cast(); - if (!operand_type) { - op->emitOpError() << "first operand has no shaped type"; - return {}; - } - auto loc = op->getLoc(); - SmallVector shape_values; - shape_values.reserve(operand_type.getRank()); - auto shape_scalar_type = rewriter->getIntegerType(64); - for (auto element : llvm::enumerate(operand_type.getShape())) { - if (element.value() == ShapedType::kDynamicSize) { - Value dim = rewriter->create(loc, operand, element.index()); - shape_values.push_back( - rewriter->create(loc, dim, shape_scalar_type)); - } else { - shape_values.push_back(rewriter->create( - loc, rewriter->getI64IntegerAttr(element.value()))); - } - } - return rewriter->create( - loc, RankedTensorType::get({operand_type.getRank()}, shape_scalar_type), - shape_values); - } -}; - -} // namespace impl - -// Default template to cover HLO operations whose shape derivation is unknown. -template -struct ShapeDerivation { - using impl = impl::UnknownShape; -}; - -// Element-wise operations that have the shape of their first operand. - -#define SAME_SHAPE_AS_FIRST_OPERAND(Op) \ - template <> \ - struct ShapeDerivation { \ - using impl = impl::SameShapeAsFirstOperand; \ - }; - -SAME_SHAPE_AS_FIRST_OPERAND(AbsOp) -SAME_SHAPE_AS_FIRST_OPERAND(AddOp) -SAME_SHAPE_AS_FIRST_OPERAND(AndOp) -SAME_SHAPE_AS_FIRST_OPERAND(CeilOp) -SAME_SHAPE_AS_FIRST_OPERAND(CosOp) -SAME_SHAPE_AS_FIRST_OPERAND(DivOp) -SAME_SHAPE_AS_FIRST_OPERAND(ExpOp) -SAME_SHAPE_AS_FIRST_OPERAND(MaxOp) -SAME_SHAPE_AS_FIRST_OPERAND(MinOp) -SAME_SHAPE_AS_FIRST_OPERAND(MulOp) -SAME_SHAPE_AS_FIRST_OPERAND(NegOp) -SAME_SHAPE_AS_FIRST_OPERAND(RemOp) -SAME_SHAPE_AS_FIRST_OPERAND(SubOp) -SAME_SHAPE_AS_FIRST_OPERAND(TanhOp) - -#undef SAME_SHAPE_AS_FIRST_OPERAND - -} // namespace xla_hlo -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_HLO_SHAPE_DERIVATION_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 8f955d6944a..7d4b17ef291 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -46,7 +46,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/kernels/conv_grad_shape_utils.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" @@ -427,6 +427,42 @@ static DenseIntElementsAttr Get2DTransposePerm(BoolAttr transpose, Builder *b) { return GetI64ElementsAttr({0, 1}, b); } +//===----------------------------------------------------------------------===// +// MatrixBandPart op utilities. +//===----------------------------------------------------------------------===// + +// Gets the size of the dimension `dim_from_end` from the end of `input`. +// Requires that `input` is a tensor. +static int GetDimensionSizeFromEnd(Value input, int dim_from_end) { + // Note: the verifier enforces that `input` is a ranked tensor. + auto input_type = input.getType().cast(); + auto input_shape = input_type.getShape(); + int dim = (input_shape.size() - 1) - dim_from_end; + return input_shape[dim]; +} + +// Gets a 2D tensor type with shape {dim_0, dim_1}, where `dim_0` and `dim_1` +// have the same size as the last two dimensions of `input` (the second-to-last +// dimension and last dimension, respectively). The element type of the +// outputted RankedTensorType will match the element type of `input`. +// Requires that `input` is a tensor. +static RankedTensorType Get2DTensorType(Value input) { + // `dim_0` refers to the second-to-last dimension; `dim_1` refers to the last. + int dim_0 = GetDimensionSizeFromEnd(input, 1); + int dim_1 = GetDimensionSizeFromEnd(input, 0); + auto element_type = input.getType().cast().getElementType(); + return RankedTensorType::get({dim_0, dim_1}, element_type); +} + +// Creates a HLO ConvertOp, converting `input` to have the same element type as +// `elem_type_tensor`. Requires `elem_type_tensor` to be a tensor. +static Value CreateConvertOp(OpBuilder *builder, Location loc, Value input, + Value elem_type_tensor) { + auto element_type = + elem_type_tensor.getType().cast().getElementType(); + return builder->create(loc, input, element_type); +} + //===----------------------------------------------------------------------===// // Pad op utilities. //===----------------------------------------------------------------------===// @@ -1559,6 +1595,82 @@ class ConvertSizeOp : public OpRewritePattern { } }; +static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc, + Value *out_lhs, Value *out_rhs, + PatternRewriter *rewriter) { + auto lhs_type = lhs.getType().cast(); + auto rhs_type = rhs.getType().cast(); + // The last two dimensions are the matrix row/col dimensions. Don't + // broadcast them. 
+ SmallVector result_batch_shape; + OpTrait::util::getBroadcastedShape(lhs_type.getShape().drop_back(2), + rhs_type.getShape().drop_back(2), + result_batch_shape); + auto handle_one_side = [rewriter, &result_batch_shape, loc]( + Value side, RankedTensorType type, + Value *out_side) { + ArrayRef matrix_dims = type.getShape().take_back(2); + auto result_shape = result_batch_shape; + result_shape.append(matrix_dims.begin(), matrix_dims.end()); + auto result_type = + RankedTensorType::get(result_shape, type.getElementType()); + auto shape = rewriter->create( + loc, GetI64ElementsAttr(result_shape, rewriter)); + *out_side = + rewriter->create(loc, result_type, side, shape); + }; + handle_one_side(lhs, lhs_type, out_lhs); + handle_one_side(rhs, rhs_type, out_rhs); +} + +class ConvertBatchMatMulV2Op : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(TF::BatchMatMulV2Op op, + PatternRewriter &rewriter) const override { + // TODO(silvasean): Handle adj_x/adj_y + // Should be able to just set the contracting_dimensions attribute + // appropriately. + // For complex types, need to do a complex conjugation. + if (op.adj_x() || op.adj_y()) return matchFailure(); + + Value lhs = op.x(); + Value rhs = op.y(); + auto lhs_type = lhs.getType().dyn_cast(); + auto rhs_type = rhs.getType().dyn_cast(); + if (!lhs_type || !rhs_type) return matchFailure(); + // TODO(silvasean): Support dynamic shapes. + if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape()) { + return matchFailure(); + } + + // Broadcast both operands. + BroadcastBatchMatMulV2Operands(lhs, rhs, op.getLoc(), &lhs, &rhs, + &rewriter); + lhs_type = lhs.getType().cast(); + rhs_type = rhs.getType().cast(); + assert(lhs_type.getRank() == rhs_type.getRank()); + int64_t rank = lhs_type.getRank(); + auto batch_dimensions = GetI64ElementsAttr( + llvm::to_vector<4>(llvm::seq(0, rank - 2)), &rewriter); + auto lhs_contracting_dimensions = + GetI64ElementsAttr(llvm::makeArrayRef({rank - 1}), &rewriter); + auto rhs_contracting_dimensions = + GetI64ElementsAttr(llvm::makeArrayRef({rank - 2}), &rewriter); + auto dimension_numbers = DotDimensionNumbers::get( + /*lhs_batching_dimensions=*/batch_dimensions, + /*rhs_batching_dimensions=*/batch_dimensions, + /*lhs_contracting_dimensions=*/lhs_contracting_dimensions, + /*rhs_contracting_dimensions=*/rhs_contracting_dimensions, + rewriter.getContext()); + rewriter.replaceOpWithNewOp(op, op.getType(), lhs, rhs, + dimension_numbers, + /*precision_config=*/nullptr); + return matchSuccess(); + } +}; + // Converts the tf.Split op into a series of HLO slice ops when the tensor to be // split has fully static shape and the dimension to split is a constant. // @@ -1890,7 +2002,7 @@ class ConvertStridedSliceGradOp Value grad = op.dy(); Type element_type = grad.getType().cast().getElementType(); - // Perform reshape to undo any new/shrink axies done by strided slice. + // Perform reshape to undo any new/shrink axes done by strided slice. grad = rewriter.create( op.getLoc(), RankedTensorType::get(shape, element_type), grad); @@ -2892,22 +3004,22 @@ class ConvertOneHotOp : public OpRewritePattern { } }; -// Converts InfeedEnqueueTuple to XLA HLO after_all, infeed and +// Converts InfeedDequeueTuple to XLA HLO create_token, infeed and // get_tuple_element ops. // // All HLO infeed ops expect a HLO token type operand and produce a tuple // containing a token. This HLO token type is used to order multiple infeed // operations within a computation. 
The token type can come from other -// infeed/outfeed/send/recv ops or can be generated using an after_all op with -// no operands. Here we emit an after_all op to generate the token type operand -// of infeed. +// infeed/outfeed/send/recv ops or can be generated using create_token op with +// no operands. Here we emit a create_token op to generate the token type +// operand of infeed. // // For example the following IR: // %0:2 = "tf.InfeedDequeueTuple"() : () -> (tensor<3xi32>, tensor<4xf32>) // // would be lowered to // -// %token = "xla_hlo.after_all"() : () -> !xla_hlo.token +// %token = "xla_hlo.create_token"() : () -> !xla_hlo.token // %data_and_token = "xla_hlo.infeed"(%token) {infeed_config = ""} : // (!xla_hlo.token) -> tuple, tensor<4xf32>>, // !xla_hlo.token> @@ -2926,21 +3038,20 @@ class ConvertInfeedDequeueTupleOp for (auto idx_and_output : llvm::enumerate(op.outputs())) { result_types[idx_and_output.index()] = (idx_and_output.value().getType()); } - // Infeed takes a single token operand. Generate the token using after_all - // op to pass to the infeed op. - auto afterall = rewriter.create( - op.getLoc(), xla_hlo::TokenType::get(rewriter.getContext()), - ValueRange()); + // Infeed takes a single token operand. Generate the token using + // create_token op to pass to the infeed op. + auto token = rewriter.create( + op.getLoc(), xla_hlo::TokenType::get(rewriter.getContext())); // Emit infeed op. // The result type of infeed is a tuple(tuple(result types), token type). auto data_tuple_type = mlir::TupleType::get(result_types, rewriter.getContext()); auto data_and_token_type = mlir::TupleType::get( - {data_tuple_type, afterall.getType()}, rewriter.getContext()); + {data_tuple_type, token.getType()}, rewriter.getContext()); auto data_and_token = - rewriter.create(op.getLoc(), data_and_token_type, afterall, + rewriter.create(op.getLoc(), data_and_token_type, token, /*infeed_config=*/rewriter.getStringAttr("")); // The infeed instruction produces a tuple of the infeed data and a token @@ -2962,10 +3073,11 @@ class ConvertInfeedDequeueTupleOp } }; -// Converts tf.OutfeedEnqueueTuple to XLA HLO tuple, after_all and outfeed ops. +// Converts tf.OutfeedEnqueueTuple to XLA HLO tuple, create_token and outfeed +// ops. // // XLA HLO outfeed op expects a token, which we generate by emitting an -// after_all op. +// create_token op. 
// // For example the following IR: // "tf.OutfeedEnqueueTuple"(%val_1, %val_2) : (tensor<3xi32>, tensor<4xf32>) -> @@ -2975,7 +3087,7 @@ class ConvertInfeedDequeueTupleOp // // %tuple = "xla_hlo.tuple"(%val_1, %val_2) : (tensor<3xi32>, tensor<4xf32>) -> // tuple, tensor<4xf32>> -// %token = "xla_hlo.after_all"() : () -> !xla_hlo.token +// %token = "xla_hlo.create_token"() : () -> !xla_hlo.token // %outfeed_token = "xla_hlo.outfeed"(%tuple, %token) {outfeed_config = ""} : // (tuple, tensor<4xf32>>, !xla_hlo.token) -> !xla_hlo.token // @@ -2988,9 +3100,8 @@ class ConvertOutfeedEnqueueTupleOp PatternRewriter &rewriter) const override { auto token_type = xla_hlo::TokenType::get(rewriter.getContext()); auto tuple = rewriter.create(op.getLoc(), op.inputs()); - auto afterall = - rewriter.create(op.getLoc(), token_type, ValueRange()); - rewriter.create(op.getLoc(), token_type, tuple, afterall, + auto token = rewriter.create(op.getLoc(), token_type); + rewriter.create(op.getLoc(), token_type, tuple, token, /*outfeed_config=*/rewriter.getStringAttr("")); rewriter.eraseOp(op); return matchSuccess(); @@ -3520,10 +3631,13 @@ class ConvertXlaDynamicUpdateSliceOp PatternMatchResult matchAndRewrite(TF::XlaDynamicUpdateSliceOp op, PatternRewriter &rewriter) const override { auto indices_type = op.indices().getType().dyn_cast(); - if (!indices_type) return matchFailure(); + if (!indices_type || !indices_type.hasStaticShape() || + indices_type.getShape().size() != 1) + return matchFailure(); - SmallVector unpacked_indices_type( - 2, RankedTensorType::get({}, indices_type.getElementType())); + SmallVector unpacked_indices_type( + indices_type.getDimSize(0), + RankedTensorType::get({}, indices_type.getElementType())); auto unpacked_indices = rewriter.create( op.getLoc(), unpacked_indices_type, op.indices(), IntegerAttr::get(rewriter.getIntegerType(64), 0)); @@ -3533,6 +3647,80 @@ class ConvertXlaDynamicUpdateSliceOp } }; +/// Converts the Cumsum TensorFlow op to the HLO ReduceWindow op by setting +/// appropriate window dimensions, with 'add' as the reduction function. The +/// input tensor needs to have a static shape, and 'axis' must be const. The +/// TableGen pattern is not used for this rewrite because it involves regions. +class ConvertCumsumOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(TF::CumsumOp op, + PatternRewriter &rewriter) const override { + auto input = op.x(); + auto input_type = input.getType().dyn_cast(); + if (!input_type || !input_type.hasStaticShape()) { + return matchFailure(); + } + + // TODO(jennik): Add support for the optional 'exclusive' and 'reverse' + // arguments. + if (op.exclusive() || op.reverse()) { + return matchFailure(); + } + + // We can only match when the axis is a constant scalar. + DenseIntElementsAttr axis_attr; + if (!matchPattern(op.axis(), m_Constant(&axis_attr))) { + return matchFailure(); + } + + // Convert if we need to enlarge the element type's bitwidth to avoid + // precision loss. + Type input_element_type = input_type.getElementType(); + Type sum_element_type = GetSumAccumulationType(input_element_type); + input = rewriter.create(op.getLoc(), input, sum_element_type); + + ArrayRef input_shape = input_type.getShape(); + int64_t rank = input_shape.size(); + + // Get the dimension to apply the reduction on, and offset properly if it is + // negative. 
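  // To see why this yields a cumulative sum, take input_shape = [4] and
  // axis = 0 (illustrative values): the code below builds window_dims = [4],
  // window_strides = [1] and paddings = [[3, 0]], so the reduce_window output
  // at position i reduces the window [pad, pad, ..., x[0], ..., x[i]]; with
  // the zero init value used for padding, that is exactly x[0] + ... + x[i].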
+ int64_t axis = (*axis_attr.begin()).getSExtValue(); + if (axis < 0) { + axis += rank; + } + + SmallVector window_dims(rank, 1); + SmallVector window_strides(rank, 1); + window_dims[axis] = input_shape[axis]; + + SmallVector paddings(rank * 2, 0); + paddings[axis * 2] = input_shape[axis] - 1; + auto paddings_attr = DenseIntElementsAttr::get( + RankedTensorType::get({rank, 2}, rewriter.getIntegerType(64)), + paddings); + + Value init = + GetScalarConstOfType(sum_element_type, op.getLoc(), 0, &rewriter); + + auto reduce = rewriter.create( + op.getLoc(), input_type, input, init, + GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_dims)), + GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_strides)), + /*base_dilations=*/DenseIntElementsAttr(), + /*window_dilations=*/DenseIntElementsAttr(), paddings_attr); + BuildReduceBody(sum_element_type, &reduce.body(), &rewriter); + Value result = reduce.getResult(); + + // Convert back if we enlarged the element type's bitwidth. + result = + rewriter.create(op.getLoc(), result, input_element_type); + + rewriter.replaceOp(op, result); + return matchSuccess(); + } +}; + #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_tf.inc" LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { @@ -3547,9 +3735,9 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { // here for lowering to HLO. TF::PopulateLoweringTFPatterns(context, &patterns); patterns.insert< - ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBF16FloorDivOp, - ConvertConv2D, ConvertConv2DBackpropFilterOp, - ConvertConv2DBackpropInputOp, ConvertEinsumOp, + ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, + ConvertBF16FloorDivOp, ConvertConv2D, ConvertConv2DBackpropFilterOp, + ConvertConv2DBackpropInputOp, ConvertCumsumOp, ConvertEinsumOp, ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, ConvertLinSpaceOp, ConvertMaxOp, diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 519ba9235f1..b9599201601 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -193,7 +193,7 @@ def : Pat<(TF_FloorModOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), // input and result needs to ranked for computation of the broadcast dimensions. def : Pat<(TF_BroadcastToOp:$result AnyRankedTensor:$input, $shape), (HLO_BroadcastInDimOp $input, - (BinBroadcastDimensions $input, $result)), + (BinBroadcastDimensionsNonEmpty $input, $result)), [(AnyRankedTensor $result)]>; //===----------------------------------------------------------------------===// @@ -357,6 +357,89 @@ def SparseMatMulToMatMul : Pat<(TF_SparseMatMulOp $a, $b, $a_sparse, $b_sparse, (TF_MatMulOp $a, $b, $transpose_a, $transpose_b)>; +//===----------------------------------------------------------------------===// +// MatrixBandPart op pattern. 
+//===----------------------------------------------------------------------===// + +class getIntegerAttr: NativeCodeCall< + "$_builder.getI64IntegerAttr(" # x # ")">; + +class GetDimensionSizeFromEnd: NativeCodeCall< + "$_builder.getI64IntegerAttr(GetDimensionSizeFromEnd($0, " # dimFromEnd # "))" + >; + +// TODO(b/149615308): Enable IotaOp usage as a child operation in a pattern +// For now, this op needs to be created in C++ because the expected output type +// cannot be inferred. +class createIotaOp: NativeCodeCall< + "$_builder.create($0.getOwner()->getLoc(), " + "Get2DTensorType($1), $_builder.getI64IntegerAttr(" # dim # "))">; + +// This op needs to be created in C++ because the generated Convert Op has no +// way to specify shape information as an input. In the MatrixBandPart op +// lowering, ConvertOp is not a root operation and the appropriate types cannot +// be inferred, so we construct it manually. +def createConvertOp: NativeCodeCall< + "CreateConvertOp(&($_builder), $0.getOwner()->getLoc(), $1, $2)">; + +// Performs a substitution of MatrixBandPartOp for XLA HLO ops. Psuedocode is +// shown below, given a tensor `input` with k dimensions [I, J, K, ..., M, N] +// and two integers, `num_lower` and `num_upper`: +// +// iota_m = { M x N matrix with 0,1,...M along the M dimension } +// iota_n = { M x N matrix with 0,1,...N along the N dimension } +// num_lower_or_m = (num_lower < 0) ? m : num_lower +// num_upper_or_n = (num_upper < 0) ? n : num_upper +// offset = iota_m - iota_n +// indicator = (-num_lower_or_m < offset) & (offset < num_upper_or_n) +// zero_matrix = { [I, J, K,...M, N] zero matrix } +// return (indicator ? input : zero_matrix) +// +// TODO(b/149961547): Support dynamic shaped `input` in MatrixBandPartOp. +def : Pattern<(TF_MatrixBandPartOp:$op AnyRankedTensor:$input, $num_lower, $num_upper), + [(HLO_ConstOp:$m_dim (GetDimensionSizeFromEnd<"0"> $input)), + (HLO_ConstOp:$n_dim (GetDimensionSizeFromEnd<"1"> $input)), + (HLO_SelectOp:$num_lower_or_m + (HLO_CompareOp + $num_lower, (HLO_ConstOp:$zero (ConstantSplat<"0"> $num_lower)), + (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT + ), + $m_dim, + $num_lower + ), + (HLO_SelectOp:$num_upper_or_n + (HLO_CompareOp + $num_upper, $zero, + (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT + ), + $n_dim, + $num_upper + ), + (HLO_SelectOp + (HLO_AndOp + (HLO_CompareOp + (HLO_NegOp + (createConvertOp $op, $num_lower_or_m, $input) + ), + (HLO_SubOp:$offset + (createIotaOp<"1"> $op, $input), (createIotaOp<"0"> $op, $input), + (NullDenseIntElementsAttr) + ), + (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LE + ), + (HLO_CompareOp + $offset, + (createConvertOp + $op, $num_upper_or_n, $input + ), + (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LE + ), + (BinBroadcastDimensions $offset, $input) + ), + $input, + (HLO_ConstOp (ConstantSplat<"0"> $input)) + )]>; + //===----------------------------------------------------------------------===// // Nullary op patterns. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc new file mode 100644 index 00000000000..962bf97c44d --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -0,0 +1,389 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/Optional.h" +#include "mlir/IR/Diagnostics.h" // TF:llvm-project +#include "mlir/IR/Function.h" // TF:llvm-project +#include "mlir/IR/Location.h" // TF:llvm-project +#include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/IR/Types.h" // TF:llvm-project +#include "mlir/IR/Value.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Support/LogicalResult.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h.inc" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h" +#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_context.h" +#include "tensorflow/compiler/tf2xla/xla_expression.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_properties.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/stream_executor.h" + +namespace mlir { +namespace xla_hlo { +namespace { + +template +using InlinedVector = tensorflow::gtl::InlinedVector; // non-absl ok + +static bool IsOpWhitelisted(Operation* op) { + // White-listed TensorFlow ops are known to have well behaved tf2xla kernels + // building valid MLIR using MlirHloBuilder. + // TODO(hinsu): Drop explicit whitelist when MLIR based bridge is enabled for + // all tf2xla kernels. 
+ return isa(op) || isa(op) || + isa(op) || isa(op); +} + +static llvm::Optional GetJitDevice( + const std::string& device_type, const Location& loc) { + if (device_type == "XLA_CPU") return absl::string_view("XLA_CPU_JIT"); + if (device_type == "TPU") return absl::string_view("XLA_TPU_JIT"); + // TODO(hinsu): Support GPU device along with a test for it. + + emitError(loc) << "unsupported device for legalization with tf2xla kernels: " + << device_type; + return llvm::None; +} + +static std::unique_ptr CreateDeviceMgr( + const std::string& device_type, const Location& loc) { + auto jit_device_or = GetJitDevice(device_type, loc); + if (!jit_device_or) return nullptr; + + auto* factory = tensorflow::DeviceFactory::GetFactory(device_type); + if (!factory) { + emitError(loc) << "failed to create DeviceFactory for device: " + << device_type; + return nullptr; + } + std::vector> devices; + auto status = factory->CreateDevices( + tensorflow::SessionOptions(), + /*name_prefix=*/"/job:localhost/replica:0/task:0", &devices); + if (!status.ok()) { + emitError(loc) << status.ToString(); + return nullptr; + } + + auto device = absl::make_unique( + tensorflow::SessionOptions(), tensorflow::DeviceType(*jit_device_or)); + return absl::make_unique(std::move(device)); +} + +class FuncLegalizer { + public: + static LogicalResult Legalize(FuncOp func, const std::string& device_type) { + FuncLegalizer legalizer(func, device_type); + if (failed(legalizer.PrepareParams())) return failure(); + return legalizer.Legalize(); + } + + private: + FuncLegalizer(FuncOp func, const std::string& device_type) + : func_(func), device_type_(device_type), hlo_builder_(func) {} + + ~FuncLegalizer() { context_->Unref(); } + + // Prepares OpKernelContext params common to all the ops. + // Emits an error on failure. + LogicalResult PrepareParams(); + + // Tries to legalize supported TensorFlow ops. + // Emits an error on failure. + LogicalResult Legalize(); + + // Tries to legalize the specified TensorFlow op, if supported. + // + // Emits an error and returns failure if an error is encountered during + // conversion. Note that success return value doesn't mean successful + // legalization. + LogicalResult LegalizeOp(Operation* op); + + FuncOp func_; + std::string device_type_; + + ::xla::MlirHloBuilder hlo_builder_; + tensorflow::OpOrArgLocNameMapper name_mapper_; + + tensorflow::XlaContext* context_; // Ref-counted. + + std::unique_ptr device_mgr_; + tensorflow::Device* device_; // Owned by device_mgr_; + std::unique_ptr step_container_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + tensorflow::OpKernelContext::Params params_; +}; + +LogicalResult FuncLegalizer::PrepareParams() { + // XlaCompiler within the context is only used by the functional ops to + // compile functions. We are not handling those at the moment so XlaCompiler + // is not required. + context_ = new tensorflow::XlaContext(/*compiler=*/nullptr, &hlo_builder_); + context_->Ref(); + + mlir::Location loc = func_.getLoc(); + device_mgr_ = CreateDeviceMgr(device_type_, loc); + if (!device_mgr_) return failure(); + + // Type of params_.device is DeviceBase* so store it as Device* to access + // derived class method. + device_ = device_mgr_->ListDevices().front(); + params_.device = device_; + params_.resource_manager = device_->resource_manager(); + + // Resources are cleared at the time of device manager destruction so pass + // no-op cleanup function. 
+ auto cleanup = [](const std::string& name) {}; + // Use step_id zero as we only have a single context concurrently and + // concurrently running each of the MLIR functions create a new device. + step_container_ = absl::make_unique( + /*step_id=*/0, cleanup); + tensorflow::Status status = step_container_->Create( + device_->resource_manager(), + tensorflow::XlaContext::kXlaContextResourceName, context_); + if (!status.ok()) { + emitError(loc) << "failed to create XlaContext resource: " + << status.ToString(); + return failure(); + } + params_.step_container = step_container_.get(); + + tensorflow::StatusOr version_or = + tensorflow::GetTfGraphProducerVersion( + func_.getParentOfType()); + if (!version_or.ok()) { + emitError(loc) << version_or.status().ToString(); + return failure(); + } + + flib_def_ = absl::make_unique( + tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); + pflr_ = absl::make_unique( + device_mgr_.get(), tensorflow::Env::Default(), /*config=*/nullptr, + version_or.ValueOrDie(), flib_def_.get(), tensorflow::OptimizerOptions()); + params_.function_library = pflr_->GetFLR(device_->name()); + return success(); +} + +LogicalResult FuncLegalizer::Legalize() { + // TensorFlow functions don't use CFGs. + if (func_.getBlocks().size() > 1) { + emitError(func_.getLoc()) << "requires at most one block in a TF function"; + return failure(); + } + if (func_.getBlocks().empty()) return success(); + Block& block = func_.getBlocks().front(); + + std::vector ops; + ops.reserve(block.getOperations().size()); + for (Operation& op : block.getOperations()) { + ops.push_back(&op); + } + + for (Operation* op : ops) { + if (failed(LegalizeOp(op))) return failure(); + } + return success(); +} + +LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { + if (!IsOpWhitelisted(op)) return success(); + + // Only static shaped operands are supported in XLA builders for now. + for (Type ty : op->getOperandTypes()) { + auto ranked_ty = ty.cast(); + if (!ranked_ty || !ranked_ty.hasStaticShape()) { + op->emitRemark() << "lowering requires static shaped operands"; + return success(); + } + } + + auto nodedef_or = tensorflow::ConvertTFDialectOpToNodeDef( + op, name_mapper_.GetUniqueName(op), /*ignore_unregistered_attrs=*/true); + if (!nodedef_or.ok()) { + op->emitRemark() << "failed to convert op to NodeDef: " + << nodedef_or.status().ToString(); + return success(); + } + + std::shared_ptr props; + tensorflow::Status status = tensorflow::NodeProperties::CreateFromNodeDef( + *nodedef_or.ValueOrDie(), + params_.function_library->GetFunctionLibraryDefinition(), &props); + if (!status.ok()) { + op->emitRemark() << "failed to create NodeProperties: " + << status.ToString(); + return success(); + } + tensorflow::OpKernel* op_kernel_raw; + status = params_.function_library->CreateKernel(props, &op_kernel_raw); + if (!status.ok()) { + op->emitRemark() << "failed to create tf2xla kernel: " << status.ToString(); + return success(); + } + // Transfer ownership of the kernel to a local smart pointer. + auto op_kernel = absl::WrapUnique(op_kernel_raw); + + // TensorValue in inputs are backed by tensors which in turn depend on + // expressions. So, pre-allocate them to the required size. + InlinedVector expressions; + InlinedVector tensors; + InlinedVector inputs; + expressions.reserve(op->getNumOperands()); + tensors.reserve(op->getNumOperands()); + inputs.reserve(op->getNumOperands()); + + // Prepare the list of Tensor inputs for the kernel. 
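  // In outline, each operand below goes MLIR Value -> ::xla::XlaOp (via
  // MlirHloBuilder) -> XlaExpression -> placeholder Tensor, so the tf2xla
  // kernel receives ordinary TensorValue inputs while the actual data stays
  // symbolic inside the builder.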
+ for (Value operand : op->getOperands()) { + // Skip this op if XLA doesn't support this operand type. + auto xla_op_or = hlo_builder_.MakeXlaOp(operand); + if (!xla_op_or.ok()) { + op->emitRemark() << "skipping legalization due to " + << xla_op_or.status().ToString(); + return success(); + } + ::xla::XlaOp xla_op = xla_op_or.ValueOrDie(); + + tensorflow::DataType dtype; + status = tensorflow::ConvertToDataType(operand.getType(), &dtype); + if (!status.ok()) { + op->emitRemark() << "skipping legalization due to " << status.ToString(); + return success(); + } + + auto expression = tensorflow::XlaExpression::XlaOp(xla_op, dtype); + expressions.push_back(expression); + + if (!tensorflow::DataTypeCanUseMemcpy(dtype)) { + op->emitRemark() << "skipping legalization due to unsupported type " + << operand.getType(); + return success(); + } + + auto shape_or = expression.GetShape(); + if (!shape_or.ok()) { + op->emitRemark() << "failed to get shape for expression. " + << expression.HumanString(); + return success(); + } + + tensors.emplace_back( + device_->GetAllocator(tensorflow::AllocatorAttributes()), dtype, + shape_or.ValueOrDie()); + tensorflow::Tensor& tensor = tensors.back(); + tensorflow::XlaOpKernelContext::AssignExpressionToTensor(expression, + &tensor); + inputs.emplace_back(&tensor); + } + + params_.inputs = &inputs; + params_.op_kernel = op_kernel.get(); + llvm::SmallVector output_attr( + op->getNumResults()); + params_.output_attr_array = output_attr.data(); + + hlo_builder_.setInsertionPoint(op); + hlo_builder_.SetLocation(op->getLoc()); + + // Execute the kernel. + tensorflow::OpKernelContext op_context(¶ms_, op->getNumResults()); + device_->Compute(params_.op_kernel, &op_context); + if (!op_context.status().ok()) { + op->emitRemark() << "compilation to HLO failed: " + << op_context.status().ToString(); + return success(); + } + + // Replace uses of old results using the corresponding value after the + // lowering. + for (int i = 0, e = op->getNumResults(); i < e; i++) { + tensorflow::Tensor* output = op_context.mutable_output(i); + const tensorflow::XlaExpression* expr = + tensorflow::XlaOpKernelContext::CastExpressionFromTensor(*output); + if (expr->kind() != tensorflow::XlaExpression::Kind::kXlaOp) + return op->emitError( + "expects XlaExpression of kind kXlaOp in compiled output"); + auto value = hlo_builder_.GetValue(expr->handle()); + op->getResult(i).replaceAllUsesWith(value); + } + + op->erase(); + return success(); +} + +class LegalizeTF : public FunctionPass { + public: + LegalizeTF() = default; + + LegalizeTF(const LegalizeTF&) {} + + void runOnFunction() override { + if (failed(FuncLegalizer::Legalize(getFunction(), device_type_))) + signalPassFailure(); + } + + private: + // TODO(hinsu): Support finer grained device type assignment instead of a + // global device type for all TensorFlow ops. + Option device_type_{ + *this, "device-type", + llvm::cl::desc("XLA device type for execution of TensorFlow ops. 
" + "Supports XLA_CPU and TPU for now.")}; +}; + +static PassRegistration pass( + "xla-legalize-tf-with-tf2xla", + "Legalize from TensorFlow to the HLO dialect using tf2xla kernels"); + +} // end namespace + +} // end namespace xla_hlo +} // end namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc index 1c0f3d8f242..aeaceeb27d5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc @@ -87,20 +87,20 @@ class CompareFConvert : public OpRewritePattern { return matchFailure(); auto comparison_direction = op.comparison_direction(); - CmpFPredicate compare_predicate = - llvm::StringSwitch(comparison_direction) + auto compare_predicate = + llvm::StringSwitch>(comparison_direction) .Case("EQ", CmpFPredicate::OEQ) .Case("NE", CmpFPredicate::UNE) .Case("LT", CmpFPredicate::OLT) .Case("LE", CmpFPredicate::OLE) .Case("GT", CmpFPredicate::OGT) .Case("GE", CmpFPredicate::OGE) - .Default(CmpFPredicate::NumPredicates); + .Default(llvm::None); - if (compare_predicate == CmpFPredicate::NumPredicates) - return matchFailure(); + if (!compare_predicate.hasValue()) return matchFailure(); - rewriter.replaceOpWithNewOp(op, compare_predicate, lhs, rhs); + rewriter.replaceOpWithNewOp(op, compare_predicate.getValue(), lhs, + rhs); return matchSuccess(); } }; diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_copy_removal.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_copy_removal.cc new file mode 100644 index 00000000000..86125126390 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_copy_removal.cc @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements a pass to remove redundant LHLO copy operations. + +#include "absl/memory/memory.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" + +namespace mlir { +namespace xla_lhlo { +namespace { + +// Removes LHLO copy operations that copy from allocated buffers to block +// arguments. All uses of each buffer are replaced with the corresponding block +// argument and the buffer is freed. Note that this pass only works in regions +// with a single block. +struct LhloCopyRemoval : mlir::OperationPass { + void runOnOperation() override { + llvm::SmallVector eraseList; + auto operation = getOperation(); + operation->walk([&](mlir::xla_lhlo::CopyOp copyOp) { + // If this region contains more than one block, then ignore this copy + // operation. 
+ if (copyOp.getParentRegion()->getBlocks().size() > 1) { + return; + } + + mlir::Value fromOperand = copyOp.operand(); + mlir::Value toOperand = copyOp.output(); + + // If the fromOperand value is a block argument or the toOperand + // value is not a block argument, then ignore this copy operation. + if (!fromOperand.getDefiningOp() || toOperand.getDefiningOp()) { + return; + } + + // The copy operation removal is illegal if there is at least a single use + // of toOperand value that lies between the first use of fromOperand value + // and the copy operation. + auto fromOperandUsers = fromOperand.getUsers(); + auto firstUser = *fromOperandUsers.begin(); + for (auto op : fromOperandUsers) { + if (op->isBeforeInBlock(firstUser)) firstUser = op; + } + for (auto op : toOperand.getUsers()) { + if (op->isBeforeInBlock(copyOp) && firstUser->isBeforeInBlock(op)) { + return; + } + } + + // TODO(DFKI): Use live variable analysis to solve aliasing issues among + // block arguments. + + // Remove the associated alloc operation. + auto allocOp = fromOperand.getDefiningOp(); + eraseList.push_back(allocOp); + + // Iterate over all uses of the fromOperand to find the associated + // deallocOp (if any). + for (auto op : fromOperandUsers) { + if (isa(op)) { + eraseList.push_back(op); + break; + } + } + + // Replace all uses of the fromOperand with the toOperand. This rewires + // all references pointing to the original alloc operation to the new + // target operation in order to safely remove the copy op. + fromOperand.replaceAllUsesWith(toOperand); + copyOp.erase(); + }); + for (auto op : eraseList) { + op->erase(); + } + }; +}; + +} // namespace + +std::unique_ptr createLhloCopyRemovalPass() { + return absl::make_unique(); +} + +static PassRegistration copy_removal_pass( + "lhlo-copy-removal", "Removes redundant LHLO copy operations"); + +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index 8f34034d6d3..a27a27b3760 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -63,8 +63,7 @@ class LhloFuseLinalg : public FunctionPass { SmallVector tile_sizes(tile_sizes_.begin(), tile_sizes_.end()); if (tile_sizes.empty()) { - tile_sizes = - SmallVector(generic_op.getNumInputsAndOutputs(), 1); + tile_sizes = SmallVector(generic_op.getNumLoops(), 1); } auto op = cast(generic_op.getOperation()); for (const Value result : op.getOutputBuffers()) { diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index 2c550465302..32053950fed 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -31,11 +31,11 @@ namespace mlir { namespace xla_lhlo { namespace { -template -struct BinaryOpConverter : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +template +struct BinaryOpConverter : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(LhloOp op, + PatternMatchResult matchAndRewrite(LhloOpTy op, PatternRewriter& rewriter) const override { const auto& lhs = op.lhs(); const auto& rhs = op.rhs(); @@ -56,8 +56,8 @@ struct BinaryOpConverter : public OpRewritePattern { } auto l = rewriter.create(loc, lhs, induction_vars); auto r = rewriter.create(loc, 
rhs, induction_vars); - Value opResult = MapXlaOpToStdScalarOp( - llvm::cast(op), element_type, {l, r}, &rewriter); + Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( + op, element_type, {l, r}, &rewriter); if (opResult == nullptr) { return this->matchFailure(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc new file mode 100644 index 00000000000..8ef08e4f9f3 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -0,0 +1,244 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // TF:llvm-project +#include "mlir/Dialect/LoopOps/LoopOps.h" // TF:llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Transforms/DialectConversion.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" + +namespace mlir { +namespace xla_lhlo { +namespace { + +// Converts `xla_lhlo.ReduceOp` into two loop::ParallelOp and a loop::ReduceOp. +// The outper `ParallelOp` refers to the parallel loops if there are +// any. The inner `ParalleOp` refers to the reduction loops and `ReduceOp` +// contains the reduction operator. +// +// Example: +// +// "xla_lhlo.reduce"(%buffer, %init_buf, %result) ( { +// ^bb0(%lhs: memref, %rhs: memref, %res: memref): +// +// } ) {dimensions = dense<[1]> : tensor<1xi64>} +// : (memref<100x10x5xf32>, memref, memref<100x5xf32>) -> () +// +// is converted into: +// +// %init = load %init_buf[] : memref +// loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { +// %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { +// %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> +// loop.reduce(%elem_to_reduce) { +// ^bb0(%elem: f32, %acc: f32): // no predecessors +// elem_buf = alloc() : memref +// store %elem, elem_buf[] : memref +// acc_buf = alloc() : memref +// store %acc, acc_buf[] : memref +// +// %acc_result = load acc_buf[] : memref +// loop.reduce.return %acc_result : f32 +// } : f32 +// loop.yield +// } : f32 +// loop.yield +// } +class ReduceOpConverter : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + PatternMatchResult matchAndRewrite( + xla_lhlo::ReduceOp xla_reduce_op, ArrayRef args, + ConversionPatternRewriter& rewriter) const final { + // TODO(b/137624192) Implement variadic reduce. 
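// Worked example (shapes taken from the comment above): for an operand of type
// memref<100x10x5xf32> and dimensions = dense<[1]>, the reducing dimension set
// is {1}, so the outer loop.parallel iterates (%i, %k) over 100 x 5, the inner
// loop.parallel iterates (%j) over 10 with the loaded %init value as its init
// operand, and loop.reduce combines the reduced elements.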
+ if (xla_reduce_op.out().size() != 1) return matchFailure(); + + loop::ReduceOp reduce_op = + CreateParallelLoopsWithReduceOp(xla_reduce_op, args, &rewriter); + ConvertReductionOperator(xla_reduce_op, + &reduce_op.reductionOperator().front(), &rewriter); + rewriter.replaceOp(xla_reduce_op, llvm::None); + return matchSuccess(); + } + + private: + // Creates nested `loop.parallel` ops with `loop.reduce`. The outer ParallelOp + // refers to the parallel dimensions of `xla_reduce_op` if any and the inner + // ParallelOp refers to the reduction dimensions. The loop.reduce op is + // returned. + // + // If the reduction argument is a memref<100x10x5xf32> and the + // reduction is performed along dimension 1 then this method will generate + // + // %init = load %init_buf[] : memref + // loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { + // %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { + // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> + // loop.reduce(%elem_to_reduce) { + // + // } : f32 + // loop.yield + // } : f32 + // loop.yield + // } + loop::ReduceOp CreateParallelLoopsWithReduceOp( + xla_lhlo::ReduceOp xla_reduce_op, ArrayRef args, + ConversionPatternRewriter* rewriter) const { + auto loc = xla_reduce_op.getLoc(); + DenseSet reducing_dims; + for (auto rdim : xla_reduce_op.dimensions().getIntValues()) { + reducing_dims.insert(rdim.getSExtValue()); + } + + Value operand = *xla_reduce_op.operands().begin(); + Value out = *xla_reduce_op.out().begin(); + SmallVector parallel_lower, parallel_upper, parallel_step; + SmallVector reduce_lower, reduce_upper, reduce_step; + auto operand_shape = operand.getType().cast().getShape(); + Type index_type = rewriter->getIndexType(); + for (auto dim : llvm::enumerate(operand_shape)) { + const bool is_reducing_dim = reducing_dims.count(dim.index()); + + Value ub = + dim.value() == ShapedType::kDynamicSize + ? rewriter->create(loc, operand, dim.index()).getResult() + : rewriter->create( + loc, index_type, + rewriter->getIntegerAttr(index_type, dim.value())); + Value lb = rewriter->create( + loc, index_type, rewriter->getIntegerAttr(index_type, 0)); + Value step = rewriter->create( + loc, index_type, rewriter->getIntegerAttr(index_type, 1)); + (is_reducing_dim ? reduce_lower : parallel_lower).push_back(lb); + (is_reducing_dim ? reduce_upper : parallel_upper).push_back(ub); + (is_reducing_dim ? reduce_step : parallel_step).push_back(step); + } + // Load initial value from memref. + SmallVector init_value = { + rewriter->create(loc, *xla_reduce_op.init_values().begin())}; + // Outer ParallelOp is not needed if it is a reduction across all dims. + loop::ParallelOp outer; + if (!parallel_lower.empty()) { + outer = rewriter->create(loc, parallel_lower, + parallel_upper, parallel_step); + rewriter->setInsertionPointToStart(outer.getBody()); + } + loop::ParallelOp inner = rewriter->create( + loc, reduce_lower, reduce_upper, reduce_step, init_value); + Value reduction_result = *inner.getResults().begin(); + + SmallVector out_indices; + if (outer != nullptr) { + out_indices.reserve(outer.getNumLoops()); + for (auto& iv : outer.getInductionVars()) { + out_indices.push_back(iv); + } + } else { + out_indices.push_back(rewriter->create( + loc, index_type, rewriter->getIntegerAttr(index_type, 0))); + } + + rewriter->create(loc, reduction_result, out, out_indices); + + // Load the element to reduce. + SmallVector indices; + indices.reserve(operand_shape.size()); + Block::args_iterator outer_ivs_it = + outer ? 
outer.getInductionVars().begin() : nullptr; + Block::args_iterator inner_ivs_it = inner.getInductionVars().begin(); + for (unsigned i = 0, e = operand_shape.size(); i < e; ++i) { + indices.push_back(reducing_dims.count(i) ? *inner_ivs_it++ + : *outer_ivs_it++); + } + + rewriter->setInsertionPointToStart(inner.getBody()); + Value elem = rewriter->create( + loc, *xla_reduce_op.operands().begin(), indices); + return rewriter->create(loc, elem); + } + + // Converts `xla_lhlo.reduce` reduction operator into `loop.reduce` op by + // doing buffer allocation for scalar arguments and the result of + // `loop.reduce` to make it compatible with LHLO ops. + void ConvertReductionOperator(xla_lhlo::ReduceOp xla_reduce_op, + Block* loop_reduce_op_body, + ConversionPatternRewriter* rewriter) const { + rewriter->setInsertionPointToStart(loop_reduce_op_body); + + // Allocate buffers to hold arguments of reduction operator block to stay + // compatible with the LHLO dialect ops in the reduction body. + auto loc = xla_reduce_op.getLoc(); + Value elem_arg = xla_reduce_op.body().front().getArgument(0); + Value elem_buf = + rewriter->create(loc, elem_arg.getType().cast()); + rewriter->create(loc, loop_reduce_op_body->getArgument(0), + elem_buf); + Value acc_arg = xla_reduce_op.body().front().getArgument(1); + Value acc_buf = + rewriter->create(loc, acc_arg.getType().cast()); + rewriter->create(loc, loop_reduce_op_body->getArgument(1), + acc_buf); + + // Clone the ops from `xla_lhlo.reduce` into reduction operator block. + BlockAndValueMapping mapping; + mapping.map(xla_reduce_op.body().front().getArguments(), + ValueRange{elem_buf, acc_buf, acc_buf}); + for (auto& nested : xla_reduce_op.body().front().without_terminator()) { + auto clone = rewriter->clone(nested, mapping); + mapping.map(nested.getResults(), clone->getResults()); + } + Value acc_result = rewriter->create(loc, acc_buf); + rewriter->create(loc, acc_result); + } +}; + +struct LhloLegalizeToParallelLoops + : public FunctionPass { + void runOnFunction() override { + auto func = getFunction(); + + OwningRewritePatternList patterns; + patterns.insert(func.getContext()); + + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addIllegalOp(); + + if (failed(applyPartialConversion(func, target, patterns, nullptr))) { + signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr> createLegalizeLhloToParallelLoopsPass() { + return absl::make_unique(); +} + +static PassRegistration legalize_lhlo_pass( + "lhlo-legalize-to-parallel-loops", + "Legalize from LHLO dialect to parallel loops."); + +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h new file mode 100644 index 00000000000..9d04e82430d --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_MAP_HLO_TO_LHLO_OP_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_MAP_HLO_TO_LHLO_OP_H_ + +#include + +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" + +namespace mlir { +namespace xla_hlo { + +template +struct HloToLhloOpImpl { + using Type = std::false_type; +}; +template +using HloToLhloOp = typename HloToLhloOpImpl::Type; + +#define MAP_HLO_TO_LHLO(OpName) \ + template <> \ + struct HloToLhloOpImpl { \ + using Type = xla_lhlo::OpName; \ + } + +MAP_HLO_TO_LHLO(AbsOp); +MAP_HLO_TO_LHLO(AddOp); +MAP_HLO_TO_LHLO(AndOp); +MAP_HLO_TO_LHLO(BroadcastInDimOp); +MAP_HLO_TO_LHLO(CeilOp); +MAP_HLO_TO_LHLO(ConstOp); +MAP_HLO_TO_LHLO(CompareOp); +MAP_HLO_TO_LHLO(ConvertOp); +MAP_HLO_TO_LHLO(CopyOp); +MAP_HLO_TO_LHLO(CosOp); +MAP_HLO_TO_LHLO(DivOp); +MAP_HLO_TO_LHLO(ExpOp); +MAP_HLO_TO_LHLO(IotaOp); +MAP_HLO_TO_LHLO(LogOp); +MAP_HLO_TO_LHLO(MaxOp); +MAP_HLO_TO_LHLO(MinOp); +MAP_HLO_TO_LHLO(MulOp); +MAP_HLO_TO_LHLO(NegOp); +MAP_HLO_TO_LHLO(ReduceOp); +MAP_HLO_TO_LHLO(RemOp); +MAP_HLO_TO_LHLO(RsqrtOp); +MAP_HLO_TO_LHLO(SelectOp); +MAP_HLO_TO_LHLO(SignOp); +MAP_HLO_TO_LHLO(SqrtOp); +MAP_HLO_TO_LHLO(SubOp); +MAP_HLO_TO_LHLO(TanhOp); + +#undef MAP_HLO_TO_LHLO + +} // namespace xla_hlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_MAP_HLO_TO_LHLO_OP_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index 6554942954e..40add223156 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -21,81 +21,63 @@ limitations under the License. #include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h" namespace mlir { namespace xla_lhlo { +namespace impl { -template -struct ScalarOp; +// A struct to map LhloBinaryOpTy type to the corresponding floating-point and +// integer scalar operation types. 
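// Illustrative sketch (not part of the patch): the dialect map from
// map_hlo_to_lhlo_op.h above lets a single LHLO-keyed specialization serve both
// dialects. Assuming <type_traits> is available, the mapping behaves as:
static_assert(std::is_same<xla_hlo::HloToLhloOp<xla_hlo::AddOp>,
                           xla_lhlo::AddOp>::value,
              "HLO AddOp resolves to its LHLO counterpart");
// The scalar map declared below then supplies the std-dialect op used inside
// the generated loop nest, e.g. AddFOp for floating-point element types and
// AddIOp for signless integers.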
+template +struct LhloToScalarOp; template <> -struct ScalarOp { +struct LhloToScalarOp { using FOp = ::mlir::AddFOp; using IOp = ::mlir::AddIOp; }; template <> -struct ScalarOp { - using FOp = ::mlir::AddFOp; - using IOp = ::mlir::AddIOp; -}; -template <> -struct ScalarOp { +struct LhloToScalarOp { using FOp = ::mlir::CmpFOp; using IOp = ::mlir::CmpIOp; }; template <> -struct ScalarOp { - using FOp = ::mlir::CmpFOp; - using IOp = ::mlir::CmpIOp; -}; -template <> -struct ScalarOp { +struct LhloToScalarOp { using FOp = ::mlir::DivFOp; using IOp = ::mlir::SignedDivIOp; }; template <> -struct ScalarOp { - using FOp = ::mlir::DivFOp; - using IOp = ::mlir::SignedDivIOp; -}; -template <> -struct ScalarOp { +struct LhloToScalarOp { using FOp = ::mlir::MulFOp; using IOp = ::mlir::MulIOp; }; template <> -struct ScalarOp { - using FOp = ::mlir::MulFOp; - using IOp = ::mlir::MulIOp; -}; -template <> -struct ScalarOp { +struct LhloToScalarOp { using FOp = ::mlir::RemFOp; using IOp = ::mlir::SignedRemIOp; }; template <> -struct ScalarOp { - using FOp = ::mlir::RemFOp; - using IOp = ::mlir::SignedRemIOp; -}; -template <> -struct ScalarOp { - using FOp = ::mlir::SubFOp; - using IOp = ::mlir::SubIOp; -}; -template <> -struct ScalarOp { +struct LhloToScalarOp { using FOp = ::mlir::SubFOp; using IOp = ::mlir::SubIOp; }; -template -using ScalarFOp = typename ScalarOp::FOp; -template -using ScalarIOp = typename ScalarOp::IOp; +template +struct ScalarOp { + using FOp = typename LhloToScalarOp::FOp; + using IOp = typename LhloToScalarOp::IOp; +}; + +// Alias for the map from LHLO binary op type to STD floating-point op type. +template +using ScalarFOp = typename ScalarOp::FOp; +// Alias for the map from LHLO binary op type to STD integer op type. +template +using ScalarIOp = typename ScalarOp::IOp; template -struct MapXlaOpToStdScalarOpImpl { +struct MapLhloOpToStdScalarOpImpl { Value operator()(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { return nullptr; @@ -103,7 +85,7 @@ struct MapXlaOpToStdScalarOpImpl { }; template -struct MapXlaOpToStdScalarOpImpl { +struct MapLhloOpToStdScalarOpImpl { Value operator()(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { return b->template create(loc, result_types, args, mlir::None); @@ -111,7 +93,7 @@ struct MapXlaOpToStdScalarOpImpl { }; template -struct MapXlaOpToStdScalarOpImpl { +struct MapLhloOpToStdScalarOpImpl { Value operator()(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { Type element_type = args.front().getType(); @@ -119,52 +101,34 @@ struct MapXlaOpToStdScalarOpImpl { return b->template create(loc, result_types, args, mlir::None); } - return MapXlaOpToStdScalarOpImpl{}(loc, result_types, args, b); + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); } }; -template -inline Value MapXlaOpToStdScalarOp(XlaOp xla_op, ArrayRef result_types, - ArrayRef args, OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl, FloatType, - ScalarFOp>{}(xla_op.getLoc(), - result_types, args, b); -} - -// TODO(ravishankarm): Find a way to reduce code-bloat in HLO and LHLO -// specialization. 
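// Note on the deletions that follow (template parameters shown here are
// assumptions): the former per-dialect pairs of specializations such as
//
//   MapXlaOpToStdScalarOp<xla_lhlo::AbsOp>(...)  /  <xla_hlo::AbsOp>(...)
//
// collapse into a single MapLhloOpToStdScalarOp<xla_lhlo::AbsOp>(loc, ...)
// that takes only a Location rather than the op itself, so the HLO variants
// can reuse the same implementation through the HloToLhloOp trait.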
-template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::AbsOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::AbsOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); +// Inserts the computation that corresponds to the body of the loop for lowered +// LHLO unary/binary op. Returns the value for the result. +template +inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef result_types, + ArrayRef args, OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl, FloatType, + ScalarFOp>{}(loc, result_types, + args, b); } template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::AndOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } + template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::AndOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } template @@ -176,14 +140,14 @@ inline Optional getCmpPredicate( template <> inline Optional getCmpPredicate( StringRef xla_comparison_direction) { - return llvm::StringSwitch(xla_comparison_direction) + return llvm::StringSwitch>(xla_comparison_direction) .Case("EQ", CmpFPredicate::OEQ) .Case("NE", CmpFPredicate::ONE) .Case("GE", CmpFPredicate::OGE) .Case("GT", CmpFPredicate::OGT) .Case("LE", CmpFPredicate::OLE) .Case("LT", CmpFPredicate::OLT) - .Default(CmpFPredicate::NumPredicates); + .Default(llvm::None); } template <> @@ -200,7 +164,8 @@ inline Optional getCmpPredicate( } template -inline Value MapXlaCompareOpToStdScalarOp(XLACompareOpTy xla_op, +inline Value MapXlaCompareOpToStdScalarOp(Location loc, + StringRef comparison_direction, ArrayRef result_types, ArrayRef args, OpBuilder* b) { const auto& lhs = args[0]; @@ -208,101 +173,60 @@ inline Value MapXlaCompareOpToStdScalarOp(XLACompareOpTy xla_op, Type element_type = lhs.getType(); if (element_type.isSignlessInteger()) { Optional predicate = - getCmpPredicate(xla_op.comparison_direction()); + getCmpPredicate(comparison_direction); assert(predicate.hasValue() && "expected valid comparison direction"); - return b->create>(xla_op.getLoc(), - predicate.getValue(), lhs, rhs); + return b->create>(loc, predicate.getValue(), lhs, + rhs); } if (element_type.isa()) { Optional predicate = - getCmpPredicate(xla_op.comparison_direction()); + getCmpPredicate(comparison_direction); assert(predicate.hasValue() && "expected valid comparison direction"); - return b->create>(xla_op.getLoc(), - predicate.getValue(), lhs, rhs); + return b->create>(loc, predicate.getValue(), lhs, + rhs); } return nullptr; } -template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::CompareOp xla_op, ArrayRef result_types, - ArrayRef args, OpBuilder* b) { - return MapXlaCompareOpToStdScalarOp(xla_op, result_types, - args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp( - xla_hlo::CompareOp xla_op, ArrayRef 
result_types, - ArrayRef args, OpBuilder* b) { - return MapXlaCompareOpToStdScalarOp(xla_op, result_types, - args, b); -} template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::CopyOp xla_op, ArrayRef result_types, ArrayRef args, +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { return args.front(); } -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::CopyOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return args.front(); -} template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::ExpOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::ExpOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); -} - -template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::CeilOp xla_op, ArrayRef result_types, ArrayRef args, +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::CeilOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::ConvertOp xla_op, ArrayRef result_types, - ArrayRef args, OpBuilder* b) { +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { Type sourceType = args.front().getType(); Type targetType = result_types.front(); if (mlir::SIToFPOp::areCastCompatible(sourceType, targetType)) { - return b->create(xla_op.getLoc(), result_types, args, - mlir::None); + return b->create(loc, result_types, args, mlir::None); } else if (sourceType.isa() && targetType.isa()) { FloatType src = sourceType.cast(); FloatType res = targetType.cast(); if (src.getWidth() > res.getWidth()) { - return b->create(xla_op.getLoc(), result_types, args, - mlir::None); + return b->create(loc, result_types, args, mlir::None); } else if (src.getWidth() < res.getWidth()) { - return b->create(xla_op.getLoc(), result_types, args, - mlir::None); + return b->create(loc, result_types, args, mlir::None); } // No conversion is needed for the same width floats return args.front(); @@ -311,10 +235,9 @@ inline Value MapXlaOpToStdScalarOp( IntegerType src = sourceType.cast(); IntegerType res = targetType.cast(); if (src.getWidth() > res.getWidth()) { - return b->create(xla_op.getLoc(), result_types, args, - mlir::None); + return b->create(loc, result_types, args, mlir::None); } else if (src.getWidth() < res.getWidth()) { - return b->create(xla_op.getLoc(), result_types, args, + return b->create(loc, result_types, args, mlir::None); } // No conversion is needed for the same width integers @@ -322,35 +245,25 @@ inline Value MapXlaOpToStdScalarOp( } // TODO(dfki-ehna): Add other primitive type conversions // if (mlir::FpToSiOp::areCastCompatible(sourceType, targetType)) { - // return b.create(xla_op.getLoc(), 
result_types, + // return b.create(loc, result_types, // args,mlir::None); // } - return nullptr; } template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::CosOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::CosOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } /// Implements the conversion of XLA op to scalar op (to use within region of a /// linalg.generic op) for compare-select style operations like min/max. template -struct MapXlaCompareSelectOpToStdScalarOp { - Value operator()(Location loc, StringRef comparison_direction, +struct XlaCompareSelectOpToStdScalarOp { + static Value map(Location loc, StringRef comparison_direction, ArrayRef result_types, ArrayRef args, OpBuilder* b) { return nullptr; @@ -361,9 +274,9 @@ struct MapXlaCompareSelectOpToStdScalarOp { /// dialect with a given predicate based on the element type of the operand. template -struct MapXlaCompareSelectOpToStdScalarOp { - Value operator()(Location loc, StringRef comparison_direction, +struct XlaCompareSelectOpToStdScalarOp { + static Value map(Location loc, StringRef comparison_direction, ArrayRef result_types, ArrayRef args, OpBuilder* b) { Type element_type = args.front().getType(); @@ -374,117 +287,142 @@ struct MapXlaCompareSelectOpToStdScalarOpcreate<::mlir::SelectOp>(loc, cmp, args[0], args[1]); } - return MapXlaCompareSelectOpToStdScalarOp{}( + return XlaCompareSelectOpToStdScalarOp::map( loc, comparison_direction, result_types, args, b); } }; template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::MaxOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaCompareSelectOpToStdScalarOp< - IntegerType, ScalarIOp, CmpIPredicate, FloatType, - ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "GT", - result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::MaxOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaCompareSelectOpToStdScalarOp< - IntegerType, ScalarIOp, CmpIPredicate, FloatType, - ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "GT", - result_types, args, b); -} - -template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::MinOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaCompareSelectOpToStdScalarOp< - IntegerType, ScalarIOp, CmpIPredicate, FloatType, - ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "LT", - result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::MinOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaCompareSelectOpToStdScalarOp< - IntegerType, ScalarIOp, CmpIPredicate, FloatType, - ScalarFOp, CmpFPredicate>{}(xla_op.getLoc(), "LT", - result_types, args, b); -} - -template <> -inline Value MapXlaOpToStdScalarOp(xla_lhlo::NegOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::NegOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), 
result_types, args, b); -} - -template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::SelectOp xla_op, ArrayRef result_types, - ArrayRef args, OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl<::mlir::SelectOp>{}(xla_op.getLoc(), - result_types, args, b); -} -template <> -inline Value MapXlaOpToStdScalarOp( - xla_hlo::SelectOp xla_op, ArrayRef result_types, ArrayRef args, +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl<::mlir::SelectOp>{}(xla_op.getLoc(), - result_types, args, b); + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::SignOp xla_op, ArrayRef result_types, ArrayRef args, +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return XlaCompareSelectOpToStdScalarOp< + IntegerType, ScalarIOp, CmpIPredicate, FloatType, + ScalarFOp, CmpFPredicate>::map(loc, "GT", + result_types, args, + b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return XlaCompareSelectOpToStdScalarOp< + IntegerType, ScalarIOp, CmpIPredicate, FloatType, + ScalarFOp, CmpFPredicate>::map(loc, "LT", + result_types, args, + b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl<::mlir::SelectOp>{}(loc, result_types, args, + b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { Type element_type = args.front().getType(); if (element_type.isa()) { FloatType float_type = element_type.cast(); APFloat const_value = float_type.isF32() ? APFloat(1.0f) : APFloat(1.0); - Value one = b->create(xla_op.getLoc(), const_value, - float_type); - return b->create<::mlir::CopySignOp>(xla_op.getLoc(), result_types, one, - args[0]); + Value one = b->create(loc, const_value, float_type); + return b->create<::mlir::CopySignOp>(loc, result_types, one, args[0]); } return nullptr; } template <> -inline Value MapXlaOpToStdScalarOp( - xla_lhlo::TanhOp xla_op, ArrayRef result_types, ArrayRef args, +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } + template <> -inline Value MapXlaOpToStdScalarOp(xla_hlo::TanhOp xla_op, - ArrayRef result_types, - ArrayRef args, - OpBuilder* b) { - return MapXlaOpToStdScalarOpImpl{}( - xla_op.getLoc(), result_types, args, b); +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); } +} // namespace impl + +struct XlaOpToStdScalarOp { + // Implementation for LHLO ops except xla_lhlo::CompareOp. 
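// How callers are expected to use this entry point (sketch with assumed
// variable names; it mirrors the linalg/affine lowering call sites updated
// later in this patch):
//
//   Value scalar = xla_lhlo::XlaOpToStdScalarOp::map(
//       op, body_result_types, body_args, &rewriter);
//
// The overloads below are disambiguated at compile time: LHLO ops resolve
// directly, while HLO ops first go through their LHLO counterpart. Note the
// differing `unsigned i = 0` / `int i = 0` dummy parameters, which keep the two
// enable_if'd templates from colliding as redefinitions.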
+ template ::value && + std::is_same, + std::false_type>::value>> + static Value map(XlaOpTy op, ArrayRef result_types, + ArrayRef args, OpBuilder* b, unsigned i = 0) { + return impl::MapLhloOpToStdScalarOp(op.getLoc(), result_types, + args, b); + } + + // Implementation for HLO ops except xla_hlo::CompareOp. + template , + typename = std::enable_if_t< + !std::is_same::value && + !std::is_same::value>> + static Value map(XlaOpTy op, ArrayRef result_types, + ArrayRef args, OpBuilder* b, int i = 0) { + return impl::MapLhloOpToStdScalarOp(op.getLoc(), result_types, + args, b); + } + + // Implementation for xla_lhlo::CompareOp. + template ::value>> + static Value map(xla_lhlo::CompareOp op, ArrayRef result_types, + ArrayRef args, OpBuilder* b) { + auto comparison_direction = op.comparison_direction(); + return impl::MapXlaCompareOpToStdScalarOp( + op.getLoc(), comparison_direction, result_types, args, b); + } + + // Implementation for xla_hlo::CompareOp. + template ::value>> + static Value map(xla_hlo::CompareOp op, ArrayRef result_types, + ArrayRef args, OpBuilder* b) { + auto comparison_direction = op.comparison_direction(); + return impl::MapXlaCompareOpToStdScalarOp( + op.getLoc(), comparison_direction, result_types, args, b); + } +}; + } // namespace xla_lhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 8c0ed08fb66..b1afd543c2e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -29,6 +29,7 @@ class ModuleOp; class Operation; template class OpPassBase; +class Pass; namespace xla_hlo { @@ -59,11 +60,6 @@ std::unique_ptr> createLegalizeToLhloPass(); // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); -// Removes unnecessary LHLO copies which copy from the allocated buffers to the -// block arguments. These copies have been created by replacing TensorStoreOp -// with LHLO.CopyOp in HLO to LHLO lowering. -std::unique_ptr> createLhloCopyRemovalPass(); - } // namespace xla_hlo namespace xla_lhlo { @@ -89,6 +85,15 @@ std::unique_ptr> createLegalizeToGpuPass(); std::unique_ptr> createLhloFuseLinalg( bool use_parallel_loops = false, ArrayRef tile_sizes = {}); +// Removes unnecessary LHLO copies which copy from the allocated buffers to the +// block arguments. The block arguments are used instead of all uses of these +// buffers. The buffers are freed. This pass only works in regions that contain +// a single block. +std::unique_ptr createLhloCopyRemovalPass(); + +// Lowers from LHLO dialect to parallel loops. +std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); + } // namespace xla_lhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc index 6447c5d6c3f..071cc575656 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project @@ -28,20 +30,47 @@ namespace xla_hlo { namespace { -// Broadcasts the 1D value tensor to rank. -Value broadcastToFeatureDim(Location loc, Type result_type, Value value_1d, +// Broadcasts the 1D value tensor 'value_1d' to the shape of 'result_type'. If +// 'shape_value' is initialized, creates a dynamic broadcast, otherwise creates +// a static broadcast. +Value BroadcastToFeatureDim(Location loc, RankedTensorType result_type, + Value value_1d, Value shape_value, int64_t feature_dim, - ConversionPatternRewriter& rewriter) { + ConversionPatternRewriter& rewriter) { // NOLINT Builder b(rewriter.getContext()); auto dims_type = RankedTensorType::get({1}, b.getIntegerType(64)); auto dims = DenseIntElementsAttr::get(dims_type, {feature_dim}); + if (shape_value) { + return rewriter.createOrFold( + loc, result_type, value_1d, shape_value, dims); + } + assert(result_type.hasStaticShape()); return rewriter.create(loc, result_type, value_1d, dims); } +// Calculate the shape value of operand, assuming it is a dynamic shape with +// static rank. +Value CalculateShapeValue(Location loc, Value operand, + ConversionPatternRewriter& rewriter) { // NOLINT + RankedTensorType result_type = operand.getType().dyn_cast(); + llvm::SmallVector shape_values; + int64_t rank = result_type.getRank(); + shape_values.reserve(rank); + for (int64_t i = 0; i < rank; ++i) { + auto index_value = rewriter.create(loc, operand, i); + shape_values.push_back(rewriter.create( + loc, index_value, rewriter.getIntegerType(32))); + } + Type shape_element_type = shape_values.front().getType(); + return rewriter.create( + loc, RankedTensorType::get({rank}, shape_element_type), shape_values); +} + Value MaterializeEpsilon(Operation* op, FloatAttr epsilon_attr, - FloatType fp_type, Type broadcast_to_type, - ConversionPatternRewriter& rewriter) { + FloatType fp_type, Value variance, + RankedTensorType broadcast_to_type, + ConversionPatternRewriter& rewriter) { // NOLINT Builder b(rewriter.getContext()); if (epsilon_attr.getType() != fp_type) { // Need to convert. @@ -66,9 +95,16 @@ Value MaterializeEpsilon(Operation* op, FloatAttr epsilon_attr, DenseElementsAttr::get(scalar_type, {epsilon_attr.cast()}); Value epsilon = rewriter.create(op->getLoc(), epsilon_tensor_attr); - epsilon = rewriter.create( - op->getLoc(), broadcast_to_type, epsilon, /*broadcast_dims=*/nullptr); - return epsilon; + auto dims_type = RankedTensorType::get({0}, b.getIntegerType(64)); + auto dims = DenseIntElementsAttr::get(dims_type, SmallVector{}); + if (broadcast_to_type.hasStaticShape()) { + return rewriter.create( + op->getLoc(), broadcast_to_type, epsilon, /*broadcast_dims=*/dims); + } + Value shape_value = CalculateShapeValue(op->getLoc(), variance, rewriter); + return rewriter.createOrFold( + op->getLoc(), broadcast_to_type, epsilon, shape_value, + /*broadcast_dims=*/dims); } class UnfuseBatchNormInferencePattern @@ -84,9 +120,10 @@ class UnfuseBatchNormInferencePattern // Enforce type invariants. // Note that we deduce the actual element type from the variance, // which should not be subject to quantization at a higher level. 
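// For reference, the unfused inference computation materialized below is, in
// scalar form (per the comments further down in this pattern):
//
//   output = scale * (input - mean) / sqrt(variance + epsilon) + offset
//
// where scale, offset, mean and variance are 1-D per-feature values broadcast
// along feature_dim, either statically or, for dynamic shapes, via the shape
// value produced by CalculateShapeValue.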
- auto input_type = operands.operand().getType(); - auto variance_type = operands.variance().getType().dyn_cast(); - if (!variance_type) { + auto input_type = operands.operand().getType().dyn_cast(); + auto variance_type = + operands.variance().getType().dyn_cast(); + if (!input_type || !variance_type) { return matchFailure(); } auto fp_type = variance_type.getElementType().dyn_cast(); @@ -97,8 +134,9 @@ class UnfuseBatchNormInferencePattern // Add epsilon to the variance and sqrt to get stddev: // stddev = sqrt(variance + epsilon) - auto epsilon = MaterializeEpsilon(bn_op.getOperation(), bn_op.epsilonAttr(), - fp_type, variance_type, rewriter); + auto epsilon = + MaterializeEpsilon(bn_op.getOperation(), bn_op.epsilonAttr(), fp_type, + operands.variance(), variance_type, rewriter); if (!epsilon) { return matchFailure(); } @@ -108,14 +146,22 @@ class UnfuseBatchNormInferencePattern stddev = rewriter.create(bn_op.getLoc(), stddev); // Broadcast all terms. - auto broadcast_scale = broadcastToFeatureDim( - bn_op.getLoc(), input_type, operands.scale(), feature_dim, rewriter); - auto broadcast_offset = broadcastToFeatureDim( - bn_op.getLoc(), input_type, operands.offset(), feature_dim, rewriter); - auto broadcast_mean = broadcastToFeatureDim( - bn_op.getLoc(), input_type, operands.mean(), feature_dim, rewriter); - auto broadcast_stddev = broadcastToFeatureDim( - bn_op.getLoc(), input_type, stddev, feature_dim, rewriter); + Value shape_value; + if (!input_type.hasStaticShape()) { + shape_value = + CalculateShapeValue(bn_op.getLoc(), operands.operand(), rewriter); + } + auto broadcast_scale = + BroadcastToFeatureDim(bn_op.getLoc(), input_type, operands.scale(), + shape_value, feature_dim, rewriter); + auto broadcast_offset = + BroadcastToFeatureDim(bn_op.getLoc(), input_type, operands.offset(), + shape_value, feature_dim, rewriter); + auto broadcast_mean = + BroadcastToFeatureDim(bn_op.getLoc(), input_type, operands.mean(), + shape_value, feature_dim, rewriter); + auto broadcast_stddev = BroadcastToFeatureDim( + bn_op.getLoc(), input_type, stddev, shape_value, feature_dim, rewriter); // Compute: // scale * (input - mean) / stddev + offset diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm_pass.cc index 039d6ed45e2..ccec4d73b6e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project @@ -33,6 +34,7 @@ struct TestUnfuseBatchNormPass : public FunctionPass { // Consider the xla_hlo dialect legal for tests. 
conversionTarget.addLegalDialect(); + conversionTarget.addLegalDialect(); conversionTarget.addIllegalOp(); PopulateUnfuseBatchNormPatterns(&getContext(), &conversionPatterns); diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 7f7060fef64..0daec32fbab 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -149,8 +149,8 @@ class PointwiseToLinalgConverter : public OpConversionPattern { rewriter.setInsertionPointToEnd(block); // TODO(ravishankarm) : For now use the method in xla_lhlo namespace. That // method needs to be moved out of there. - Value opResult = xla_lhlo::MapXlaOpToStdScalarOp( - llvm::cast(op), bodyResultTypes, bodyArgs, &rewriter); + Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( + op, bodyResultTypes, bodyArgs, &rewriter); if (!opResult) { return ConversionPattern::matchFailure(); } @@ -180,9 +180,9 @@ class ScalarPointwiseToStandardConverter : public OpConversionPattern { auto lhs = rewriter.create(loc, lhlo_op.lhs()); auto rhs = rewriter.create(loc, lhlo_op.rhs()); // TODO(ravishankarm) : Move this method out of xla_lhlo namespace. - Value opResult = xla_lhlo::MapXlaOpToStdScalarOp( - llvm::cast(lhlo_op), argType.getElementType(), - llvm::ArrayRef{lhs, rhs}, &rewriter); + Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( + lhlo_op, argType.getElementType(), llvm::ArrayRef{lhs, rhs}, + &rewriter); rewriter.create(loc, opResult, lhlo_op.out()); rewriter.eraseOp(lhlo_op); return ConversionPattern::matchSuccess(); @@ -208,9 +208,6 @@ class DataMovementOpConverter : public OpConversionPattern { auto resultType = getXLAOpResultType(op); if (!verifyXLAOpBufferOrTensorSemantics(op)) return ConversionPattern::matchFailure(); - // TODO(b/150203558) Enable once tiling/fusion works in this case. - if (isLHLO && (operandType.getRank() == 0)) - return ConversionPattern::matchFailure(); ArrayAttr indexingMapsAttr = static_cast(*this).getIndexingMapsAttr(op, &rewriter); if (!indexingMapsAttr) return ConversionPattern::matchFailure(); @@ -253,14 +250,13 @@ class BroadcastInDimConverter auto operandShape = operandType.getShape(); SmallVector dimExprs; + AffineMap inputMap = AffineMap::get(b->getContext()); { dimExprs.reserve(nloops); if (broadcastOp.broadcast_dimensions()) { for (const auto& broadcastDim : - enumerate(broadcastOp.broadcast_dimensions() - .getValue() - .getIntValues())) { + enumerate(broadcastOp.broadcast_dimensions().getIntValues())) { int size = broadcastDim.value().getSExtValue(); // TODO(pifon): Add support for args with dynamic shapes for the case // when a dimension of size 1 is broadcasted into dim of size N. @@ -272,58 +268,13 @@ class BroadcastInDimConverter } if (dimExprs.empty()) { // The input is a scalar, i.e. this is a scalar broadcast op. - dimExprs.push_back(b->getAffineConstantExpr(0)); + inputMap = AffineMap::get(nloops, /*symbolCount=*/0, b->getContext()); + } else { + inputMap = AffineMap::get(nloops, /*symbolCount=*/0, dimExprs); } } return b->getAffineMapArrayAttr( - {AffineMap::get(nloops, /*symbolCount=*/0, dimExprs), - b->getMultiDimIdentityMap(nloops)}); - } -}; - -// Special case for scalar broadcast in lhlo. -// TODO(b/150203558) Remove once the bug is fixed. 
-class ScalarBroadcastInDimConverter - : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - PatternMatchResult matchAndRewrite( - xla_lhlo::BroadcastInDimOp broadcastOp, ArrayRef args, - ConversionPatternRewriter& rewriter) const final { - auto operandMemrefType = - broadcastOp.operand().getType().dyn_cast(); - // Only support scalar operands. - if (operandMemrefType.getRank() != 0) return matchFailure(); - auto resultMemrefType = - broadcastOp.output().getType().dyn_cast(); - if (!operandMemrefType || !resultMemrefType) return matchFailure(); - auto broadcastDims = broadcastOp.broadcast_dimensions(); - if (!broadcastDims.hasValue()) return matchFailure(); - - unsigned nloops = resultMemrefType.getRank(); - SmallVector indexingMaps{ - AffineMapAttr::get(rewriter.getMultiDimIdentityMap(nloops))}; - auto loc = broadcastOp.getLoc(); - auto linalgOp = rewriter.create( - loc, ArrayRef{}, broadcastOp.output(), - rewriter.getI64IntegerAttr(0), // args_in - rewriter.getI64IntegerAttr(1), // args_out - rewriter.getArrayAttr(indexingMaps), - GetNParallelLoopsAttrs(nloops, &rewriter), - /*doc=*/nullptr, /*fun=*/nullptr, /*library_call=*/nullptr); - - // Add a block to the region. - auto* region = &linalgOp.region(); - auto* block = rewriter.createBlock(region, region->end()); - block->addArguments(resultMemrefType.getElementType()); - - rewriter.setInsertionPointToEnd(block); - auto scalar = - rewriter.create(loc, broadcastOp.operand(), llvm::None); - rewriter.create(loc, scalar.getResult()); - rewriter.eraseOp(broadcastOp); - return matchSuccess(); + {inputMap, b->getMultiDimIdentityMap(nloops)}); } }; @@ -537,21 +488,24 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + // TODO(ataei): Remove this pattern, CopyOp is folded away. 
PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, - ScalarBroadcastInDimConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); @@ -632,12 +586,15 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter>(context); } diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index f3ee4e38f31..77cd3dc074c 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -338,6 +338,21 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "searchsorted_op_test", + size = "small", + timeout = "moderate", + srcs = ["searchsorted_op_test.py"], + python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], + deps = [ + ":xla_test", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "svd_op_test", size = "medium", diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index f42d51dbb3a..8543e8ea2be 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import itertools +import os import numpy as np @@ -1600,4 +1601,8 @@ class BinaryOpsTest(xla_test.XLATestCase): if __name__ == "__main__": + # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems + os.environ[ + "XLA_FLAGS"] = "--xla_cpu_enable_fast_math=false " + os.environ.get( + "XLA_FLAGS", "") googletest.main() diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index a2da9815b18..31eb514b14c 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -514,6 +514,13 @@ class ResizeNearestNeighborTest(xla_test.XLATestCase): [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]], dtype=np.float32)) + def testBFloat16(self): + img = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + dtype=dtypes.bfloat16.as_numpy_dtype) + self._assertForwardOpMatchesExpected(img, [4, 4], expected=np.array( + [[1, 2, 2, 3], [4, 5, 5, 6], [4, 5, 5, 6], [7, 8, 8, 9]], + dtype=np.float32)) + def testAlignCorners3x3To12x12_uint8(self): # TODO(b/72099414): enable the test for TPU when the issue is fixed. if (self.device not in ["XLA_GPU", "XLA_CPU"]): @@ -590,12 +597,14 @@ class ResizeBilinearTest(parameterized.TestCase, xla_test.XLATestCase): ("256x256To299x299", 256, 256, 299, 299), ("512x512To299x299", 512, 512, 299, 299), ("224x224To224x224", 224, 224, 224, 224), + ("224x224To224x224-bfloat", 224, 224, 224, 224, + dtypes.bfloat16.as_numpy_dtype), # This test is disabled because it is very slow. 
It is slow because # 383 is prime, 383 and 2047 are coprime, and 2048 is large. # ("Disabled_384x72To2048x384", 384, 72, 2048, 384), ) - def test(self, src_y, src_x, dst_y, dst_x): + def test(self, src_y, src_x, dst_y, dst_x, dtype=np.float32): if test.is_built_with_rocm(): self.skipTest("Disabled on ROCm, because it runs out of memory") @@ -613,7 +622,7 @@ class ResizeBilinearTest(parameterized.TestCase, xla_test.XLATestCase): ] self._assertForwardOpMatchesExpected( - np.array(input_data, dtype=np.float32), [dst_y, dst_x], + np.array(input_data, dtype=dtype), [dst_y, dst_x], expected=np.array(result, dtype=np.float32), large_tolerance=True) diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index dfa5bc106ed..8bad4da0524 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -54,6 +54,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/tests/searchsorted_op_test.py b/tensorflow/compiler/tests/searchsorted_op_test.py new file mode 100644 index 00000000000..d77bd0902d3 --- /dev/null +++ b/tensorflow/compiler/tests/searchsorted_op_test.py @@ -0,0 +1,75 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test for XLA implementation of tf.searchsorted.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class SearchSorteddOpTest(xla_test.XLATestCase): + + def test1D(self): + # Test against NumPy implementation (which is 1D only). 
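    # Illustrative reference values (not part of the test): with a sorted
    # sequence [1, 3, 5] and query value 3, np.searchsorted returns 1 for
    # side='left' and 2 for side='right'; the XLA kernel under test is expected
    # to match np.searchsorted on the random inputs generated below.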
+ np.random.seed(1) + for side in ['left', 'right']: + for dtype in [np.float32, np.int32]: + values = np.random.uniform( + low=-1000, high=1000, size=(10,)).astype(dtype) + unsorted = np.random.uniform( + low=-1000, high=1000, size=(20,)).astype(dtype) + + sorted_sequence = np.sort(unsorted) + np_ans = np.searchsorted(sorted_sequence, values, side=side) + + with self.session() as session: + with self.test_scope(): + tf_ans = array_ops.searchsorted(sorted_sequence, values, side=side) + tf_out = session.run(tf_ans) + self.assertAllEqual(np_ans, tf_out) + + def _test2DExample(self, dtype, side, sorted_sequence, values, correct_ans): + + with self.session() as session: + with self.test_scope(): + tf_ans = array_ops.searchsorted(sorted_sequence, values, side=side) + tf_out = session.run(tf_ans) + self.assertAllEqual(correct_ans, tf_out) + + def testLowerBound2DExample(self): + # 2D TensorFlow documentation example. + for dtype in self.float_types | self.int_types: + sorted_sequence = np.array([[0, 3, 9, 9, 10], [1, 2, 3, 4, 5]], dtype) + values = np.array([[2, 4, 9], [0, 2, 6]], dtype) + correct_ans = np.array([[1, 2, 2], [0, 1, 5]], dtype) + self._test2DExample(dtype, 'left', sorted_sequence, values, correct_ans) + + def testUpperBound2DExample(self): + # 2D TensorFlow documentation example. + for dtype in self.float_types | self.int_types: + sorted_sequence = np.array([[0, 3, 9, 9, 10], [1, 2, 3, 4, 5]], dtype) + values = np.array([[2, 4, 9], [0, 2, 6]], dtype) + correct_ans = np.array([[1, 2, 4], [0, 2, 5]], dtype) + self._test2DExample(dtype, 'right', sorted_sequence, values, correct_ans) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/tests/while_test.py b/tensorflow/compiler/tests/while_test.py index 420dc04bec3..f1f8b6c353c 100644 --- a/tensorflow/compiler/tests/while_test.py +++ b/tensorflow/compiler/tests/while_test.py @@ -240,6 +240,22 @@ class WhileTest(xla_test.XLATestCase): self.assertAllEqual(r, np.array([(x + 3) * 2 for x in nums])) xla_context.Exit() + @test_util.enable_control_flow_v2 + def testMapBackPropFalse(self): + if is_compile_on_demand(): + self.skipTest("list_ops are not supported in cpu_ondemand") + with self.session(), self.test_scope(): + xla_context = control_flow_ops.XLAControlFlowContext() + xla_context.Enter() + nums = [1, 2, 3, 4, 5, 6] + elems = constant_op.constant(nums, name="data") + r = map_fn.map_fn( + lambda x: math_ops.multiply(math_ops.add(x, 3), 2), + elems, + back_prop=False) + self.assertAllEqual(r, np.array([(x + 3) * 2 for x in nums])) + xla_context.Exit() + def is_compile_on_demand(): return ("TF_XLA_FLAGS" in os.environ and diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index b26b509b067..371a5804008 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -13,9 +13,15 @@ load( "tf_gen_op_wrapper_py", "tf_gpu_kernel_library", ) + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") -load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "pybind_extension") load( "//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", @@ -235,8 +241,8 @@ tf_custom_op_py_library( name = "trt_ops_loader", srcs_version = "PY2AND3", deps = [ + ":_pywrap_py_utils", ":trt_ops", - ":wrap_py_utils", 
"//tensorflow/python:errors", "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:platform", @@ -547,12 +553,16 @@ cc_library( ]), ) -tf_py_wrap_cc( - name = "wrap_py_utils", - srcs = ["utils/py_utils.i"], - copts = tf_copts(), +pybind_extension( + name = "_pywrap_py_utils", + srcs = ["utils/py_utils_wrapper.cc"], + link_in_framework = True, + module_name = "_pywrap_py_utils", deps = [ ":py_utils", - "//third_party/python_runtime:headers", + "//tensorflow/core/platform:env", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:status", + "@pybind11", ], ) diff --git a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc index 50d0ae8c000..82e68cbb28d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/logger_registry.cc @@ -47,7 +47,7 @@ class LoggerRegistryImpl : public LoggerRegistry { private: mutable mutex mu_; mutable std::unordered_map> - registry_ GUARDED_BY(mu_); + registry_ TF_GUARDED_BY(mu_); }; LoggerRegistry* GetLoggerRegistry() { diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 7995163ed44..d9d8a4461a3 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -44,6 +44,7 @@ Status TRTOptimizationPass::Init( if (config == nullptr) { return Status::OK(); } + VLOG(1) << "config = " << config->DebugString(); const auto params = config->parameter_map(); if (params.count("minimum_segment_size")) { minimum_segment_size_ = params.at("minimum_segment_size").i(); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 05e6575dc1c..a0524f4a90e 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -123,7 +123,15 @@ class TRTEngineOp : public AsyncOpKernel { // input and 2) The index of the IExecutionContext compatible with the input. StatusOr> GetEngine( const std::vector& input_concrete_shapes, - OpKernelContext* ctx, TRTEngineCacheResource* cache_res); + OpKernelContext* ctx, TRTEngineCacheResource* cache_resource); + + // Builds and returns a cuda engine for the input shapes. If building the + // engine fails, enters a dummy entry into the cache_resource cache so we + // don't continually try to build the same failing engine. + StatusOr> BuildEngine( + const std::vector& input_concrete_shapes, int batch_size, + bool use_calibration, TRTInt8Calibrator* calibrator, + TRTEngineCacheResource* cache_resource); // Verify that the input shapes are consistent and can be handled by this op. Status VerifyInputShapes(const std::vector& shapes); @@ -881,6 +889,40 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, }}); } +StatusOr> TRTEngineOp::BuildEngine( + const std::vector& input_concrete_shapes, int batch_size, + bool use_calibration, TRTInt8Calibrator* calibrator, + TRTEngineCacheResource* cache_resource) { + VLOG(1) << "Building a new TensorRT engine for " << name() + << " with input shapes: " + << TensorShapeUtils::ShapeListString(input_concrete_shapes); + + // Use concrete shapes for implicit batch mode and partial shapes for + // explicit batch mode. + const std::vector& conversion_input_shapes = + use_implicit_batch_ + ? 
std::vector(input_concrete_shapes.begin(), + input_concrete_shapes.end()) + : input_partial_shapes_; + TrtUniquePtrType engine; + auto status = convert::ConvertGraphDefToEngine( + segment_graph_def_, precision_mode_, batch_size, workspace_size_, + conversion_input_shapes, &logger, cache_resource->allocator_.get(), + calibrator, &engine, use_calibration, use_implicit_batch_, nullptr, + &cache_resource->profiles_); + if (!status.ok()) { + LOG(WARNING) << "Engine creation for " << name() << " failed. " + << "The native segment will be used instead. " + << "Reason: " << status; + // Store an empty engine in the cache for these input shapes so we don't try + // to build the same failing engine again. + cache_resource->cache_.emplace(input_concrete_shapes, + absl::make_unique()); + return status; + } + return engine; +} + StatusOr> TRTEngineOp::GetEngine( const std::vector& input_concrete_shapes, OpKernelContext* ctx, TRTEngineCacheResource* cache_res) { @@ -918,7 +960,32 @@ StatusOr> TRTEngineOp::GetEngine( infer->deserializeCudaEngine(serialized_segment_.c_str(), serialized_segment_.size(), nullptr)); if (!static_engine) { - return std::pair(&empty_context, 0); + if (!allow_build_at_runtime_) { + // Store an empty engine in the cache so we don't try to load the same + // failing engine again. + cache.emplace(input_concrete_shapes, + absl::make_unique()); + return std::pair(&empty_context, 0); + } + if (segment_graph_def_.node().empty()) { + FunctionLibraryRuntime* lib = ctx->function_library(); + auto status = ConstructFunctionHandle(lib, ctx->device()->name()); + if (status.ok()) { + status = + FunctionDefToGraphDef(func_handle_, lib, &segment_graph_def_); + } + if (!status.ok()) { + LOG(WARNING) << "Getting segment graph for " << name() << " failed. " + << "Reason: " << status; + } + } + auto result = BuildEngine(input_concrete_shapes, batch_size, + /*use_calibration=*/false, + /*calibrator=*/nullptr, cache_res); + if (!result.ok()) { + return std::pair(&empty_context, 0); + } + static_engine = std::move(result.ValueOrDie()); } auto raw_static_engine = static_engine.get(); const auto max_batch_size = raw_static_engine->getMaxBatchSize(); @@ -977,36 +1044,16 @@ StatusOr> TRTEngineOp::GetEngine( cache.emplace(input_concrete_shapes, absl::make_unique()); return std::pair(&empty_context, 0); } - TrtUniquePtrType engine; - bool convert_successfully = false; - LOG(INFO) << "Building a new TensorRT engine for " << name() - << " with input shapes: " - << TensorShapeUtils::ShapeListString(input_concrete_shapes); - - // Use concrete shapes for implicit batch mode and partial shapes for - // explicit batch mode. - const std::vector& conversion_input_shapes = - use_implicit_batch_ - ? std::vector(input_concrete_shapes.begin(), - input_concrete_shapes.end()) - : input_partial_shapes_; // Up to this point, calibrator_ can never be empty, since otherwise it // means calibration_mode_ is true and this path won't get executed. - auto status = convert::ConvertGraphDefToEngine( - segment_graph_def_, precision_mode_, batch_size, workspace_size_, - conversion_input_shapes, &logger, allocator, calibrator_.get(), &engine, - use_calibration_, use_implicit_batch_, &convert_successfully, - &cache_res->profiles_); - if (!status.ok()) { - LOG(WARNING) << "Engine creation for " << name() << " failed. " - << "The native segment will be used instead. " - << "Reason: " << status; - // Store an empty engine in the cache for these input shapes so we don't - // try to build the same failing engine again. 
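The new BuildEngine helper above also captures a simple negative-caching idea: when TensorRT cannot build an engine for a given set of input shapes, an empty EngineContext is stored under those shapes so later calls skip the rebuild and fall back to the native segment. A minimal Python sketch of that pattern, with hypothetical names (get_or_build_engine, build_fn) rather than the actual TensorRT code:

def get_or_build_engine(cache, input_shapes, build_fn):
    # cache maps shape tuples to a built engine, or to None for a known failure.
    key = tuple(input_shapes)
    if key not in cache:
        try:
            cache[key] = build_fn(input_shapes)
        except RuntimeError:
            # Negative entry: don't retry this shape, use the native segment.
            cache[key] = None
    return cache[key]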
- cache.emplace(input_concrete_shapes, absl::make_unique()); + auto result = BuildEngine(input_concrete_shapes, batch_size, + use_calibration_, calibrator_.get(), cache_res); + if (!result.ok()) { return std::pair(&empty_context, 0); } + TrtUniquePtrType engine = + std::move(result.ValueOrDie()); std::vector> exec_context; TF_RETURN_IF_ERROR(cache_res->profiles_.CreateExecutionContexts( engine.get(), exec_context)); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 8ef72ba44d5..2c5821df6ac 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -71,7 +71,7 @@ class CreateTRTResourceHandle : public OpKernel { string resource_name_; Tensor handle_; mutex mutex_; - bool initialized_ GUARDED_BY(mutex_) = false; + bool initialized_ TF_GUARDED_BY(mutex_) = false; TF_DISALLOW_COPY_AND_ASSIGN(CreateTRTResourceHandle); }; diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils.i b/tensorflow/compiler/tf2tensorrt/utils/py_utils.i deleted file mode 100644 index d6e8eac5836..00000000000 --- a/tensorflow/compiler/tf2tensorrt/utils/py_utils.i +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -/* Wrap trt_conversion */ -%{ -#define SWIG_FILE_WITH_INIT -%} - -%{ -struct version_struct{ - int vmajor; - int vminor; - int vpatch; -}; - -PyObject* version_helper(version_struct* in) { - PyObject *tuple(nullptr); - tuple = Py_BuildValue("(iii)", in->vmajor, in->vminor, in->vpatch); - if (!tuple) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, - "Tuple creation from version structure failed!"); - } - return NULL; - } - return tuple; -} - -%} - -%typemap(out) version_struct { - PyObject *tuple = version_helper(&$1); - if (!tuple) SWIG_fail; - $result = tuple; -} - -%{ -#include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" -%} - -%ignore ""; -%rename("%s") get_linked_tensorrt_version; -%rename("%s") get_loaded_tensorrt_version; -%rename("%s") is_tensorrt_enabled; - -%{ - -version_struct get_linked_tensorrt_version() { - // Return the version at the link time. - version_struct s; - tensorflow::tensorrt::GetLinkedTensorRTVersion( - &s.vmajor, &s.vminor, &s.vpatch); - return s; -} - -version_struct get_loaded_tensorrt_version() { - // Return the version from the loaded library. 
- version_struct s; - tensorflow::tensorrt::GetLoadedTensorRTVersion( - &s.vmajor, &s.vminor, &s.vpatch); - return s; -} - -bool is_tensorrt_enabled() { - return tensorflow::tensorrt::IsGoogleTensorRTEnabled(); -} - -%} - -version_struct get_linked_tensorrt_version(); -version_struct get_loaded_tensorrt_version(); -bool is_tensorrt_enabled(); - -%rename("%s") ""; diff --git a/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc new file mode 100644 index 00000000000..0d7819931b1 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/py_utils_wrapper.cc @@ -0,0 +1,43 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "include/pybind11/pybind11.h" +#include "tensorflow/compiler/tf2tensorrt/utils/py_utils.h" + +std::tuple get_linked_tensorrt_version() { + int major, minor, patch; + tensorflow::tensorrt::GetLinkedTensorRTVersion(&major, &minor, &patch); + return std::tuple{major, minor, patch}; +} + +std::tuple get_loaded_tensorrt_version() { + int major, minor, patch; + tensorflow::tensorrt::GetLoadedTensorRTVersion(&major, &minor, &patch); + return std::tuple{major, minor, patch}; +} + +PYBIND11_MODULE(_pywrap_py_utils, m) { + m.doc() = "_pywrap_py_utils: Various TensorRT utilities"; + m.def("get_linked_tensorrt_version", get_linked_tensorrt_version, + "Return the compile time TensorRT library version as the tuple " + "(Major, Minor, Patch)."); + m.def("get_loaded_tensorrt_version", get_loaded_tensorrt_version, + "Return the runtime time TensorRT library version as the tuple " + "(Major, Minor, Patch)."); + m.def("is_tensorrt_enabled", tensorflow::tensorrt::IsGoogleTensorRTEnabled, + "Returns True if TensorRT is enabled."); +} diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 97995fa186a..8e345254f75 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -136,7 +136,7 @@ struct EngineContext { TrtUniquePtrType cuda_engine; Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx) - EXCLUSIVE_LOCKS_REQUIRED(mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(mu) { if (idx >= execution_context.size()) { return errors::Internal("Requested engine context with index ", idx, ", but only ", execution_context.size(), @@ -152,7 +152,7 @@ struct EngineContext { // for inference at a time therefore we need a mutex. More details at // https://docs.nvidia.com/deeplearning/sdk/tensorrt-best-practices/index.html#thread-safety std::vector> execution_context - GUARDED_BY(mu); + TF_GUARDED_BY(mu); }; // Contains the context required to build the calibration data. 
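A rough usage sketch for the pybind11 module defined in py_utils_wrapper.cc above. The import path below is an assumption for illustration; the extension may live elsewhere in the installed package:

# Assumed import location of the new extension module.
from tensorflow.compiler.tf2tensorrt import _pywrap_py_utils as trt_utils

if trt_utils.is_tensorrt_enabled():
    linked = trt_utils.get_linked_tensorrt_version()  # (major, minor, patch) at build time
    loaded = trt_utils.get_loaded_tensorrt_version()  # version of the library found at runtime
    if linked[0] != loaded[0]:
        print('TensorRT major version mismatch:', linked, 'vs', loaded)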
@@ -174,8 +174,8 @@ class CalibrationContext { private: mutex mu_; - bool terminated_ GUARDED_BY(mu_) = false; - std::string calibration_table_ GUARDED_BY(mu_); + bool terminated_ TF_GUARDED_BY(mu_) = false; + std::string calibration_table_ TF_GUARDED_BY(mu_); }; ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName; diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 30bd1eff8eb..a6f88df7e40 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -5,6 +5,7 @@ load( ) load( "//tensorflow/core/platform:build_config.bzl", + "tf_additional_tensor_coding_deps", "tf_proto_library", "tf_proto_library_cc", ) @@ -37,6 +38,7 @@ package_group( "//learning/brain/tools/tf_replay/...", "//tensorflow/...", "//tensorflow_models/...", + "//third_party/mlperf/submissions/training/v0_7/models/...", ], ) @@ -176,8 +178,8 @@ cc_library( "//tensorflow/compiler/mlir/xla:mlir_hlo_to_hlo", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/strings", ], ) @@ -236,18 +238,20 @@ cc_library( features = ["fully_static_link"], linkstatic = 1, visibility = [":friends"], - # Note, we specifically remove MKL dependencies so the standalone does - # not require the MKL binary blob. + # Note, we specifically removed MKL and multithreaded dependencies so the + # standalone does not require the MKL binary blob or threading libraries. + # + # TODO(ebrevdo): Remove tf_additional_tensor_coding_deps in favor of + # absl/strings:cord when we update absl to a newer version. deps = [ - "//tensorflow/core/framework:numeric_types", - "//third_party/eigen3", "@com_google_absl//absl/base", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/base:dynamic_annotations", "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:cord", "@com_google_absl//absl/synchronization", - ], + "//third_party/eigen3", + "//tensorflow/core/framework:numeric_types", + ] + tf_additional_tensor_coding_deps(), alwayslink = 1, ) @@ -692,6 +696,7 @@ cc_library( srcs = ["mlir_bridge_pass.cc"], hdrs = ["mlir_bridge_pass.h"], deps = [ + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:device_util", @@ -711,6 +716,7 @@ cc_library( ], deps = [ ":mlir_bridge_pass", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass_registration", "//tensorflow/core:core_cpu", ], alwayslink = 1, diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index f0aebc9b543..eadd05fcee0 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -54,6 +54,7 @@ namespace tensorflow { namespace { Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, const std::vector& expressions, + const NameAttrList& func, std::vector* args) { auto client = ctx->compiler()->client(); std::vector arg_must_be_compile_time_constant(expressions.size()); @@ -78,9 +79,10 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, TF_ASSIGN_OR_RETURN(absl::optional value, expressions[i]->ResolveConstant(client)); if (!value.has_value()) { - return errors::InvalidArgument( - "Argument to function must be a compile-time constant, but " - "unable to resolve argument value to a constant."); + return
errors::InvalidArgument(absl::StrCat( + "Argument ", i, " to function '", func.name(), + "' must be a compile-time constant, but ", + "unable to resolve argument value to a constant.")); } arg.kind = XlaCompiler::Argument::kConstant; arg.constant_value = *value; @@ -249,8 +251,8 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, auto graph = compiler->GetGraph(fbody); - TF_RETURN_IF_ERROR( - PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments)); + TF_RETURN_IF_ERROR(PrepareArguments(&xla_op_context, graph.get(), expressions, + func, &arguments)); bool add_token_input_output = func.attr().find(kXlaTokenInputNodesAttrName) != func.attr().end(); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 8571c503299..5f1c2f28ba4 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -55,6 +55,7 @@ tf_kernel_library( "index_ops.cc", "l2loss_op.cc", "listdiff_op.cc", + "lower_upper_bound_ops.cc", "lrn_ops.cc", "matmul_op.cc", "matrix_band_part_op.cc", @@ -149,6 +150,7 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index 9f0ec65bb71..b60a13972a7 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -29,10 +29,10 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/ops_util.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index ba11b12fa2a..63e3f185421 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/framework/kernel_shape_util.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/util/tensor_format.h" diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc index 8b27e8e85a3..38d8056d3e5 100644 --- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -25,10 +26,15 @@ class IdentityOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { for (int i = 0; i < ctx->num_inputs(); ++i) { - // Forwards using the underlying op_kernel_context so both tensor and - // resource values are forwarded correctly. - ctx->op_kernel_context()->set_output(i, - ctx->op_kernel_context()->input(i)); + if (IsTensorListInput(ctx, i)) { + ctx->SetTensorListOutput(i, ctx->Input(i)); + } else { + DCHECK(ctx->input_type(i) != DT_VARIANT); + // Forwards using the underlying op_kernel_context so both tensor and + // resource values are forwarded correctly. + ctx->op_kernel_context()->set_output( + i, ctx->op_kernel_context()->input(i)); + } } } @@ -48,7 +54,7 @@ REGISTER_XLA_OP(Name("IdentityN") IdentityOp); REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp); REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp); -REGISTER_XLA_OP(Name("StopGradient"), IdentityOp); +REGISTER_XLA_OP(Name("StopGradient").AllowVariantTypes(), IdentityOp); REGISTER_XLA_OP(Name("Snapshot"), IdentityOp); } // namespace diff --git a/tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc b/tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc new file mode 100644 index 00000000000..0eacf8812f1 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc @@ -0,0 +1,116 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/comparison_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +// Builds a LowerBound or UpperBound op, the distinction lying in +// comparison_direction: GT => LowerBoundOp, GE => UpperBoundOp. +// Note that this is an O(MN) algorithm: all entries in each sorted_inputs row +// are considered, and their sorted nature is not fully exploited. 
+void BuildLowerUpperBoundOp(XlaOpKernelContext* ctx, DataType out_dtype, + xla::ComparisonDirection comparison_direction) { + const TensorShape sorted_inputs_shape = ctx->InputShape("sorted_inputs"); + const TensorShape values_shape = ctx->InputShape("values"); + const xla::XlaOp sorted_inputs = ctx->Input("sorted_inputs"); + const xla::XlaOp values = ctx->Input("values"); + + // We are assuming both inputs are 2D, which they will be given the current + // implementation of tf.searchsorted. + OP_REQUIRES(ctx, sorted_inputs_shape.dims() == 2, + errors::FailedPrecondition("sorted_inputs must be 2D")); + OP_REQUIRES(ctx, values_shape.dims() == 2, + errors::FailedPrecondition("values must be 2D")); + + // Add a new inner dimension to values, to allow broadcasting along the inner + // dimension of sorted_sequence. + auto new_values_shape = values_shape; + new_values_shape.InsertDim(/* d */ 2, /* size */ 1); + auto values_reshaped = xla::Reshape(values, new_values_shape.dim_sizes()); + + // Add a new penultimate dimension to sorted_inputs, to allow broadcasting of + // sorted_sequence entries for each value. + auto new_sorted_inputs_shape = sorted_inputs_shape; + new_sorted_inputs_shape.InsertDim(/* d */ 1, /* size */ 1); + auto sorted_inputs_reshaped = + xla::Reshape(sorted_inputs, new_sorted_inputs_shape.dim_sizes()); + + // We are relying on broadcasting to compare each value against each entry in + // the associated sorted_inputs row. + // The reshapes above leave the tensors with equal rank of 3, so broadcast + // dimensions are not explicitly specified. + auto comparison = xla::Compare(values_reshaped, sorted_inputs_reshaped, {}, + comparison_direction); + + const DataType accumulation_type = XlaHelpers::SumAccumulationType(out_dtype); + + // Convert boolean comparison results to integers so we can sum them. + auto comparison_int = + XlaHelpers::ConvertElementType(comparison, accumulation_type); + + // Sum the comparison results over the inner dimension to find the index for + // each value. + xla::XlaBuilder* builder = ctx->builder(); + auto reduced = + xla::Reduce(comparison_int, XlaHelpers::Zero(builder, accumulation_type), + *ctx->GetOrCreateAdd(accumulation_type), {2}); + + ctx->SetOutput(0, reduced); +} + +class LowerBoundOp : public XlaOpKernel { + public: + explicit LowerBoundOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + BuildLowerUpperBoundOp(ctx, out_dtype_, xla::ComparisonDirection::kGt); + } + + private: + DataType out_dtype_; +}; + +REGISTER_XLA_OP(Name("LowerBound"), LowerBoundOp); + +class UpperBoundOp : public XlaOpKernel { + public: + explicit UpperBoundOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + BuildLowerUpperBoundOp(ctx, out_dtype_, xla::ComparisonDirection::kGe); + } + + private: + DataType out_dtype_; +}; + +REGISTER_XLA_OP(Name("UpperBound"), UpperBoundOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 7ac4cb8fb06..6d0d569724f 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -17,125 +17,32 @@ limitations under the License. 
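The comment on BuildLowerUpperBoundOp above describes the O(MN) scheme: broadcast-compare every value against every entry of its sorted_inputs row, then sum the boolean results over the inner dimension. A NumPy sketch of the same computation, for illustration of the semantics only (not the XLA lowering):

import numpy as np

def lower_upper_bound(sorted_inputs, values, upper=False):
    # sorted_inputs: [batch, n], values: [batch, m]; result: [batch, m].
    v = values[:, :, np.newaxis]          # [batch, m, 1]
    s = sorted_inputs[:, np.newaxis, :]   # [batch, 1, n]
    cmp = (v >= s) if upper else (v > s)  # GE -> UpperBound, GT -> LowerBound
    return cmp.sum(axis=2)                # index = count of entries compared true

sorted_inputs = np.array([[0, 3, 9, 9, 10], [1, 2, 3, 4, 5]])
values = np.array([[2, 4, 9], [0, 2, 6]])
print(lower_upper_bound(sorted_inputs, values))              # [[1 2 2], [0 1 5]]
print(lower_upper_bound(sorted_inputs, values, upper=True))  # [[1 2 4], [0 2 5]]

These are the same expected values as the 2D documentation examples in searchsorted_op_test.py above.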
#include -#include "absl/container/flat_hash_set.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/raw_os_ostream.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" -#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { -// Dumps the MLIR module to disk. -// This require the TF_DUMP_GRAPH_PREFIX to be set to a path that exist (or can -// be created). -static void DumpModule(mlir::ModuleOp module, llvm::StringRef file_prefix) { - std::string prefix = GetDumpDirFromEnvVar(); - if (prefix.empty()) { - return; - } - - auto* env = tensorflow::Env::Default(); - auto status = env->RecursivelyCreateDir(prefix); - if (!status.ok()) { - LOG(WARNING) << "cannot create directory '" + prefix + - "': " + status.error_message(); - return; - } - prefix += "/" + file_prefix.str(); - if (!tensorflow::Env::Default()->CreateUniqueFileName(&prefix, ".mlir")) { - LOG(WARNING) << "cannot create unique filename, won't dump MLIR module."; - return; - } - - std::unique_ptr file_writer; - status = env->NewWritableFile(prefix, &file_writer); - if (!status.ok()) { - LOG(WARNING) << "cannot open file '" + prefix + - "': " + status.error_message(); - return; - } - - // Print the module to a string before writing to the file. - std::string txt_module; - { - llvm::raw_string_ostream os(txt_module); - module.print(os); - } - - status = file_writer->Append(txt_module); - if (!status.ok()) { - LOG(WARNING) << "error writing to file '" + prefix + - "': " + status.error_message(); - return; - } - (void)file_writer->Close(); - VLOG(1) << "Dumped MLIR module to " << prefix; -} - // This runs the first phase of the "bridge", transforming the graph in a form // that can be executed with delegation of some computations to an accelerator. // This builds on the model of XLA where a subset of the graph is encapsulated // and attached to a "compile" operation, whose result is fed to an "execute" // operation. The kernel for these operations is responsible to lower the // encapsulated graph to a particular device. 
-Status MlirBridgePass::Run(const DeviceSet& device_set, - const ConfigProto& config_proto, - std::unique_ptr* graph, - FunctionLibraryDefinition* flib_def, - std::vector* control_ret_node_names, - bool* control_rets_updated) { +Status MlirBridgePass::Run(const ConfigProto& config_proto, + mlir::ModuleOp module) { if (!config_proto.experimental().enable_mlir_bridge()) { VLOG(1) << "Skipping MLIR Bridge Pass, session flag not enabled"; return Status::OK(); } VLOG(1) << "Running MLIR Bridge Pass"; - - GraphDebugInfo debug_info; - mlir::MLIRContext context; - GraphImportConfig import_config; - import_config.graph_as_function = true; - import_config.control_outputs = *control_ret_node_names; - TF_ASSIGN_OR_RETURN(auto module_ref, - ConvertGraphToMlir(**graph, debug_info, *flib_def, - import_config, &context)); - - AddDevicesToOp(*module_ref, &device_set); - - if (VLOG_IS_ON(1)) DumpModule(*module_ref, "mlir_bridge_before_"); - - // Run the bridge now TF_RETURN_IF_ERROR( - mlir::TFTPU::TPUBridge(*module_ref, /*enable_logging=*/VLOG_IS_ON(1))); - - if (VLOG_IS_ON(1)) DumpModule(*module_ref, "mlir_bridge_after_"); - - GraphExportConfig export_config; - export_config.graph_as_function = true; - absl::flat_hash_set control_ret_nodes; - TF_RETURN_WITH_CONTEXT_IF_ERROR( - ConvertMlirToGraph(*module_ref, export_config, graph, flib_def, - &control_ret_nodes), - "Error converting MLIR module back to graph"); - - control_ret_node_names->clear(); - control_ret_node_names->reserve(control_ret_nodes.size()); - for (const auto* node : control_ret_nodes) - control_ret_node_names->push_back(node->name()); - - *control_rets_updated = true; + mlir::TFTPU::TPUBridge(module, /*enable_logging=*/VLOG_IS_ON(1))); return Status::OK(); } - -Status MlirBridgeV1CompatPass::Run( - const GraphOptimizationPassOptions& options) { +Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, + mlir::ModuleOp module) { // Skip function graphs as MlirBridgePass will be used instead. if (options.is_function_graph) return Status::OK(); @@ -145,31 +52,8 @@ Status MlirBridgeV1CompatPass::Run( } VLOG(1) << "Running MLIR Bridge V1 Compat Pass"; - - GraphDebugInfo debug_info; - mlir::MLIRContext context; - GraphImportConfig import_config; - import_config.upgrade_legacy = true; - TF_ASSIGN_OR_RETURN( - auto module_ref, - ConvertGraphToMlir(**options.graph, debug_info, *options.flib_def, - import_config, &context)); - - AddDevicesToOp(*module_ref, options.device_set); - - if (VLOG_IS_ON(1)) DumpModule(*module_ref, "mlir_bridge_v1_compat_before_"); - - // Run the bridge now - TF_RETURN_IF_ERROR(mlir::TFTPU::TPUBridgeV1Compat( - *module_ref, /*enable_logging=*/VLOG_IS_ON(1))); - - if (VLOG_IS_ON(1)) DumpModule(*module_ref, "mlir_bridge_v1_compat_after_"); - - GraphExportConfig export_config; - TF_RETURN_WITH_CONTEXT_IF_ERROR( - ConvertMlirToGraph(*module_ref, export_config, options.graph, - options.flib_def), - "Error converting MLIR module back to graph"); + TF_RETURN_IF_ERROR( + mlir::TFTPU::TPUBridgeV1Compat(module, /*enable_logging=*/VLOG_IS_ON(1))); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h index e7f3fee79ca..b7f8ef203f7 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h @@ -16,28 +16,42 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ #define TENSORFLOW_COMPILER_TF2XLA_MLIR_BRIDGE_PASS_H_ -#include "tensorflow/core/common_runtime/function_optimization_registry.h" -#include "tensorflow/core/common_runtime/optimization_registry.h" +#include "llvm/ADT/StringRef.h" +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" namespace tensorflow { // This pass uses MLIR to implement all the conversion steps to target XLA from // a TensorFlow Function Graph. It is meant to expose a very limited set of // functionalities during the bring-up of MLIR-based bridge. -class MlirBridgePass : public FunctionOptimizationPass { +class MlirBridgePass : public MlirOptimizationPass { public: - Status Run(const DeviceSet& device_set, const ConfigProto& config_proto, - std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, - std::vector* control_ret_node_names, - bool* control_rets_updated) override; + llvm::StringRef name() const override { return "bridge"; } + + bool IsEnabled(const ConfigProto& config_proto) const override { + return config_proto.experimental().enable_mlir_bridge(); + } + + // This should be used as a thin mapper around mlir::ModulePass::runOnModule + // API integrated with the Tensorflow runtime. + Status Run(const ConfigProto& config_proto, mlir::ModuleOp module) override; }; // This pass uses MLIR to implement all the conversion steps to target XLA from // a TensorFlow V1 Graph. It is meant to expose a very limited set of // functionalities during the bring-up of MLIR-based bridge. -class MlirBridgeV1CompatPass : public GraphOptimizationPass { +class MlirBridgeV1CompatPass : public MlirV1CompatOptimizationPass { public: - Status Run(const GraphOptimizationPassOptions& options) override; + llvm::StringRef name() const override { return "bridge"; } + + bool IsEnabled(const ConfigProto& config_proto) const override { + return config_proto.experimental().enable_mlir_bridge(); + } + + // This should be used as a thin mapper around mlir::ModulePass::runOnModule + // API integrated with the Tensorflow runtime. + Status Run(const GraphOptimizationPassOptions& options, + mlir::ModuleOp module) override; }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass_registration.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass_registration.cc index ac6e54d4e76..21791ff4427 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass_registration.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass_registration.cc @@ -16,15 +16,18 @@ limitations under the License. 
#include #include "tensorflow/compiler/tf2xla/mlir_bridge_pass.h" -#include "tensorflow/core/common_runtime/function_optimization_registry.h" -#include "tensorflow/core/common_runtime/optimization_registry.h" namespace tensorflow { +namespace { +constexpr int kMlirBridgePriority = 10; +} -static function_optimization_registration::FunctionOptimizationPassRegistration - register_mlir_bridge_pass(std::make_unique()); +static mlir_pass_registration::MlirOptimizationPassRegistration + register_mlir_bridge_pass(kMlirBridgePriority, + std::make_unique()); -REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0, - MlirBridgeV1CompatPass); +static mlir_pass_registration::MlirV1CompatOptimizationPassRegistration + register_v1_compat_mlir_bridge_pass( + kMlirBridgePriority, std::make_unique()); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index 3c02f9dd2e2..9303e2e9330 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -86,9 +86,10 @@ Status ConvertOutputInfo(const tf2xla::Config& config, } // namespace -Status ConvertGraphDefToXlaViaMlir(GraphDef graph_def, - const tf2xla::Config& config, - xla::XlaComputation* computation) { +Status ConvertGraphDefToXlaViaMlir( + GraphDef graph_def, const tf2xla::Config& config, + xla::XlaComputation* computation, absl::string_view debug_info_filename, + absl::string_view debug_info_path_begin_marker) { // AddPlaceholdersForFeeds prepares for PruneGraphDefInto and serves two // purposes: (1) It creates a placeholder node for each feed, so that // PruneGraphDefInfo can prune away the node containing the feed. (2) It @@ -115,7 +116,24 @@ Status ConvertGraphDefToXlaViaMlir(GraphDef graph_def, TF_RETURN_IF_ERROR(ConvertOutputInfo(config, &specs)); GraphDebugInfo debug_info; + if (!debug_info_filename.empty()) { + TF_RETURN_IF_ERROR(LoadProtoFromFile(debug_info_filename, &debug_info)); + + if (!debug_info_path_begin_marker.empty()) { + for (size_t i = 0, e = debug_info.files_size(); i < e; ++i) { + std::string* file_name = debug_info.mutable_files(i); + size_t location = + file_name->rfind(std::string(debug_info_path_begin_marker)); + if (location != -1) { + *file_name = file_name->substr(location + + debug_info_path_begin_marker.length()); + } + } + } + } + mlir::MLIRContext context; + TF_ASSIGN_OR_RETURN( mlir::OwningModuleRef module, ConvertGraphdefToMlir(pruned_graph_def, debug_info, specs, &context)); diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 9ced6e682fc..bcdfd1c6a8e 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/versions.pb.h" @@ -42,6 +43,7 @@ limitations under the License. 
#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/dump_graph.h" @@ -128,19 +130,31 @@ Status ConvertGraphToXla(std::unique_ptr graph, return Status::OK(); } -void ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { - for (auto& node : *graph_def->mutable_node()) { +Status ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { + auto update_var_handle_op_node = [](NodeDef& node) -> Status { if (node.op() == "VarHandleOp") { node.set_op(tfcompile::kXlaAotOnlyVarHandleOp); + const auto& it = node.attr().find("allowed_devices"); + if (it != node.attr().end()) { + if (!it->second.list().s().empty()) { + // TODO(b/149512838): Support non-empty allowed devices. + return errors::InvalidArgument( + "VarHandleOp with non-empty allowed devices is not supported."); + } + node.mutable_attr()->erase("allowed_devices"); + } } + return Status::OK(); + }; + for (auto& node : *graph_def->mutable_node()) { + TF_RETURN_IF_ERROR(update_var_handle_op_node(node)); } for (auto& fn : *graph_def->mutable_library()->mutable_function()) { for (auto& node : *fn.mutable_node_def()) { - if (node.op() == "VarHandleOp") { - node.set_op(tfcompile::kXlaAotOnlyVarHandleOp); - } + TF_RETURN_IF_ERROR(update_var_handle_op_node(node)); } } + return Status::OK(); } } // namespace @@ -149,7 +163,7 @@ Status ConvertGraphDefToXla(GraphDef graph_def, const tf2xla::Config& config, xla::Client* client, xla::XlaComputation* computation) { std::unique_ptr graph; - ConvertVarHandlesToAotVarHandles(&graph_def); + TF_RETURN_IF_ERROR(ConvertVarHandlesToAotVarHandles(&graph_def)); TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph)); TF_RETURN_IF_ERROR( ConvertGraphToXla(std::move(graph), config, client, computation)); diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h index a1c8806bba5..587d3c2febf 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.h +++ b/tensorflow/compiler/tf2xla/tf2xla.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_H_ #define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_H_ +#include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -34,10 +35,16 @@ Status ConvertGraphDefToXla(GraphDef graph_def, const tf2xla::Config& config, xla::Client* client, xla::XlaComputation* computation); -// Similar to ConvertGraphDefToXla, but uses MLIR. -Status ConvertGraphDefToXlaViaMlir(GraphDef graph_def, - const tf2xla::Config& config, - xla::XlaComputation* computation); +// Similar to ConvertGraphDefToXla, but uses MLIR and handle debug information. +// +// debug_info_filename: the file for the debug information proto. +// debug_info_path_begin_marker: if not empty, file pathes in the debug +// information are trimmed from the beginning to the first appearance of the +// marker. 
+Status ConvertGraphDefToXlaViaMlir( + GraphDef graph_def, const tf2xla::Config& config, + xla::XlaComputation* computation, absl::string_view debug_info_filename, + absl::string_view debug_info_path_begin_marker); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 5420cf3e04f..3870a673e4e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -28,7 +28,9 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, buffer_infos_(static_data.buffer_infos_), arg_index_table_(static_data.arg_index_table_), num_args_(static_data.num_args_), + num_variables_(static_data.num_variables_), arg_names_(static_data.arg_names_), + variable_names_(static_data.variable_names_), result_names_(static_data.result_names_), program_shape_(static_data.program_shape_), hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) { @@ -63,6 +65,8 @@ XlaCompiledCpuFunction::~XlaCompiledCpuFunction() { namespace { +constexpr int kNotFound = -1; + // Linear search through `names` looking for a match with `name`. Returns -1 if // the name isn't found, or is empty. // @@ -72,7 +76,6 @@ int LookupNameIndex(const string& name, const char** names) { // for AOT try the setting the tfcompile --gen_name_to_index flag. assert(names != nullptr); - constexpr int kNotFound = -1; if (name.empty()) { return kNotFound; } @@ -90,6 +93,14 @@ int XlaCompiledCpuFunction::LookupArgIndex(const string& name) const { return LookupNameIndex(name, arg_names_); } +int XlaCompiledCpuFunction::LookupVariableIndex(const string& name) const { + int index = LookupNameIndex(name, variable_names_); + if (index == kNotFound) { + return kNotFound; + } + return num_args_ - num_variables_ + index; +} + int XlaCompiledCpuFunction::LookupResultIndex(const string& name) const { return LookupNameIndex(name, result_names_); } diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 5e452b50e71..04d9086ce4c 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -76,12 +76,16 @@ class XlaCompiledCpuFunction { // There are num_args entry parameters. int64 num_args_ = 0; + // There are num_variables variables. + int64 num_variables_ = 0; + // The 0-based index of the result tuple, in the temp buffers. size_t result_index_ = 0; // [Optional] Arrays of arg and result names. These are arrays of C-style // strings, where the array is terminated by nullptr. const char** arg_names_ = nullptr; + const char** variable_names_ = nullptr; const char** result_names_ = nullptr; // [Optional] Arg and result shapes. @@ -150,6 +154,8 @@ class XlaCompiledCpuFunction { int num_args() const { return num_args_; } + int num_variables() const { return num_variables_; } + // Returns the size of entry parameter `idx`. // // There is a static version of this method on tfcompile generated subclasses @@ -212,10 +218,11 @@ class XlaCompiledCpuFunction { // ------------------------------ // Methods for extracting optional metadata. - // Returns true iff data is available for the Lookup{Arg,Result}Index methods. - // E.g. the data might not be compiled into the binary for AOT. + // Returns true iff data is available for the Lookup{Arg,Variable,Result}Index + // methods. E.g. the data might not be compiled into the binary for AOT. 
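LookupVariableIndex above relies on the convention that variables are passed as the trailing entry parameters, so a hit at position index within variable_names_ maps to argument slot num_args_ - num_variables_ + index. A small Python sketch of that arithmetic, mirroring the SumVariable test further down (two arguments, the second being the variable "myvar"):

def lookup_variable_index(name, variable_names, num_args):
    # Variables are the trailing arguments, in the same order as variable_names.
    if name not in variable_names:
        return -1  # kNotFound
    return num_args - len(variable_names) + variable_names.index(name)

print(lookup_variable_index('myvar', ['myvar'], num_args=2))  # 1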
bool HasNameIndices() const { - return arg_names_ != nullptr && result_names_ != nullptr; + return arg_names_ != nullptr && variable_names_ != nullptr && + result_names_ != nullptr; } // Returns the 0-based index for the argument with the given `name`. @@ -226,6 +233,14 @@ class XlaCompiledCpuFunction { // Recommended usage is to capture this in a variable for re-use. int LookupArgIndex(const string& name) const; + // Returns the 0-based index for the variable with the given `name`. + // Returns -1 if the name wasn't found, or data isn't available. + // + // The index remains constant for every instance of XlaCompiledCpuFunction + // generated from the same static data, and might not be cheap to determine. + // Recommended usage is to capture this in a variable for re-use. + int LookupVariableIndex(const string& name) const; + // Returns the 0-based index for the result with the given `name`. // Returns -1 if the name wasn't found, or data isn't available. // @@ -280,6 +295,11 @@ class XlaCompiledCpuFunction { static_data->num_args_ = num_args; } + static void set_static_data_num_variables(StaticData* static_data, + int64 num_variables) { + static_data->num_variables_ = num_variables; + } + static void set_static_data_result_index(StaticData* static_data, size_t result_index) { static_data->result_index_ = result_index; @@ -290,6 +310,11 @@ class XlaCompiledCpuFunction { static_data->arg_names_ = arg_names; } + static void set_static_data_variable_names(StaticData* static_data, + const char** variable_names) { + static_data->variable_names_ = variable_names; + } + static void set_static_data_result_names(StaticData* static_data, const char** result_names) { static_data->result_names_ = result_names; @@ -334,6 +359,9 @@ class XlaCompiledCpuFunction { // The number of incoming arguments. const int32 num_args_; + // The number of incoming variables. + const int32 num_variables_; + // Backing memory for buffer_table_ and args_, the latter depending on // AllocMode. void* alloc_buffer_table_ = nullptr; @@ -346,6 +374,7 @@ class XlaCompiledCpuFunction { // Optional metadata. const char** arg_names_ = nullptr; + const char** variable_names_ = nullptr; const char** result_names_ = nullptr; const xla::ProgramShapeProto* program_shape_ = nullptr; const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 3ea62882dcb..c30b1c0e17d 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -44,7 +44,6 @@ limitations under the License. #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/core/errors.h" @@ -1174,51 +1173,6 @@ Status XlaCompiler::BuildArguments( return Status::OK(); } -Status XlaCompiler::CompileSingleOp( - const XlaCompiler::CompileOptions& options, const NodeDef& node_def, - absl::Span args, - absl::Span result_types, CompilationResult* result) { - // TODO(b/74182462): We implement this by creating a new dummy Graph including - // _Arg nodes, and let CompileGraph walk it. This could be optimized. - std::unique_ptr graph(new Graph(OpRegistry::Global())); - - Status status; - // First create the actual node we care about computing. 
- Node* main_node = graph->AddNode(node_def, &status); - TF_RETURN_IF_ERROR(status); - - // Create dummy _Arg nodes. Link these to `node` and also via a control - // dependency edge to the _SOURCE node. - for (int64 i = 0; i < args.size(); ++i) { - Node* node; - string arg_name = absl::StrCat("_arg", i); - Status status = - NodeBuilder(arg_name, FunctionLibraryDefinition::kArgOp) - .ControlInput(graph->source_node()) - .Attr("T", args[i].kind == Argument::kResource ? DT_RESOURCE - : args[i].type) - .Attr("index", i) - .Finalize(graph.get(), &node); - TF_RETURN_IF_ERROR(status); - graph->AddEdge(node, 0, main_node, i); - } - - // Similarly with return values, create dummy _Retval nodes fed by `node`. - for (int64 i = 0; i < result_types.size(); ++i) { - Node* node; - string retval_name = absl::StrCat("_retval", i); - Status status = NodeBuilder(retval_name, FunctionLibraryDefinition::kRetOp) - .Input(main_node, i) - .Attr("T", result_types[i]) - .Attr("index", i) - .Finalize(graph.get(), &node); - TF_RETURN_IF_ERROR(status); - } - FixupSourceAndSinkEdges(graph.get()); - - return CompileGraph(options, node_def.name(), std::move(graph), args, result); -} - namespace { // Check that the ops of all non-functional nodes have been registered. diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 5ec5866632b..6a56136a9f6 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -375,14 +375,6 @@ class XlaCompiler { std::unique_ptr graph, absl::Span args, CompilationResult* result); - // Compiles a single Op, given by `node_def`, into an - // xla::XlaComputation. Similar to CompileFunction but takes a single Op as - // input. - Status CompileSingleOp(const CompileOptions& options, const NodeDef& node_def, - absl::Span args, - absl::Span result_types, - CompilationResult* result); - // Returns the shape of the XLA parameter for an argument 'arg'. // See the class comment for more details about the argument passing // convention. diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 0392cc7d345..0deaa1ea8fb 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -49,9 +49,9 @@ xla::StatusOr ComputeResultIndex( return result_slice.index(); } -// Collect names from `entries`, where T is one of tf2xla::{Feed,Fetch}. We hold -// the actual strings in nonempty_names, and hold arrays of pointers in -// name_ptrs, terminated by a nullptr entry. +// Collect names from `entries`, where T is one of +// tf2xla::{Feed,Fetch,Variable}. We hold the actual strings in nonempty_names, +// and hold arrays of pointers in name_ptrs, terminated by a nullptr entry. template void CollectNames(const T& entries, std::vector* nonempty_names, std::vector* name_ptrs) { @@ -154,14 +154,28 @@ XlaJitCompiledCpuFunction::Compile( &jit->static_data_, jit->arg_index_table_.data()); XlaCompiledCpuFunction::set_static_data_num_args( &jit->static_data_, jit->arg_index_table_.size()); + XlaCompiledCpuFunction::set_static_data_num_variables(&jit->static_data_, + config.variable_size()); XlaCompiledCpuFunction::set_static_data_result_index(&jit->static_data_, result_index); // Optional metadata is collected and set below. 
CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_); + + auto variable_copy = config.variable(); + for (auto& var : variable_copy) { + if (var.name().empty()) { + var.set_name(var.node_name()); + } + } + CollectNames(variable_copy, &jit->nonempty_variable_names_, + &jit->variable_names_); + CollectNames(config.fetch(), &jit->nonempty_result_names_, &jit->result_names_); XlaCompiledCpuFunction::set_static_data_arg_names(&jit->static_data_, jit->arg_names_.data()); + XlaCompiledCpuFunction::set_static_data_variable_names( + &jit->static_data_, jit->variable_names_.data()); XlaCompiledCpuFunction::set_static_data_result_names( &jit->static_data_, jit->result_names_.data()); XlaCompiledCpuFunction::set_static_data_program_shape( diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index 11fc4571189..107968b184d 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -77,8 +77,10 @@ class XlaJitCompiledCpuFunction { // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static // data to refer to. std::vector nonempty_arg_names_; + std::vector nonempty_variable_names_; std::vector nonempty_result_names_; std::vector arg_names_; + std::vector variable_names_; std::vector result_names_; // The backing data for the program shape. The proto form of program shape is diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index f5d6b5231ac..880cb5939b6 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -210,6 +210,9 @@ TEST(XlaJitCompiledCpuFunction, Sum) { EXPECT_EQ(function.LookupResultIndex("x_name"), -1); EXPECT_EQ(function.LookupResultIndex("y_name"), -1); + EXPECT_EQ(0, function.num_variables()); + EXPECT_EQ(function.LookupVariableIndex("x"), -1); + // Check program shape. using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); @@ -252,6 +255,14 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) { EXPECT_EQ(*static_cast(function.result_data(0)), 100); EXPECT_EQ(*static_cast(function.result_data(1)), 420); + // Check name to index lookups. + EXPECT_TRUE(function.HasNameIndices()); + + EXPECT_EQ(2, function.num_args()); + + EXPECT_EQ(1, function.num_variables()); + EXPECT_EQ(function.LookupVariableIndex("myvar"), 1); + // Check program shape. using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index a1941cc5fdf..a1c45a4bf30 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -50,7 +50,8 @@ XlaCompiler* XlaOpKernelContext::compiler() const { } // Retrieves an XlaExpression that was allocated by a previous Op. -static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { +const XlaExpression* XlaOpKernelContext::CastExpressionFromTensor( + const Tensor& tensor) { const XlaExpression* expression = reinterpret_cast(tensor.tensor_data().data()); CHECK(expression->kind() != XlaExpression::Kind::kInvalid) @@ -59,8 +60,8 @@ static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { } // Assigns an XlaExpression to a tensor on an XLA compilation device. 
-static void AssignExpressionToTensor(Tensor* tensor, - const XlaExpression& value) { +void XlaOpKernelContext::AssignExpressionToTensor(const XlaExpression& value, + Tensor* tensor) { const XlaExpression* expression = reinterpret_cast(tensor->tensor_data().data()); CHECK(expression->kind() == XlaExpression::Kind::kInvalid) @@ -396,7 +397,8 @@ namespace { Status ReadVariableInputTensor(const Tensor& tensor, DataType type, const XlaOpKernelContext* ctx, TensorShape* shape, xla::XlaOp* value) { - const XlaExpression* expression = CastExpressionFromTensor(tensor); + const XlaExpression* expression = + XlaOpKernelContext::CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); @@ -486,7 +488,8 @@ void XlaOpKernelContext::SetOutputExpression(int index, TF_ASSIGN_OR_RETURN(TensorShape shape, expression.GetShape()); TF_RETURN_IF_ERROR(context_->allocate_output(index, shape, &output)); } - AssignExpressionToTensor(context_->mutable_output(index), expression); + XlaOpKernelContext::AssignExpressionToTensor( + expression, context_->mutable_output(index)); return Status::OK(); }(); if (!status.ok()) { @@ -536,7 +539,8 @@ namespace { Status AssignVariableTensor(const Tensor& tensor, DataType type, const XlaOpKernelContext* ctx, xla::XlaOp handle, xla::XlaBuilder* builder) { - const XlaExpression* expression = CastExpressionFromTensor(tensor); + const XlaExpression* expression = + XlaOpKernelContext::CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); TF_RET_CHECK(variable != nullptr); TF_RET_CHECK(variable->kind() == XlaResource::kVariable); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 27b198f8bee..d72dd3972d3 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -278,6 +278,13 @@ class XlaOpKernelContext { // separate specialization of the computation for each DataType. const xla::XlaComputation* GetOrCreateMul(const DataType type); + // Assigns an XlaExpression to a tensor on an XLA compilation device. + static void AssignExpressionToTensor(const XlaExpression& value, + Tensor* tensor); + + // Retrieves an XlaExpression that was assigned to the specified tensor. + static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor); + private: // Returns the tensor of input `name`. const Tensor& GetInputTensorByName(absl::string_view name); diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index c6f6ffb2853..7839ae95dc0 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -229,11 +229,11 @@ class XlaOpRegistry { }; // Map from compilation device names to a description of the backend. - std::unordered_map backends_ GUARDED_BY(mutex_); + std::unordered_map backends_ TF_GUARDED_BY(mutex_); // Map from Tensorflow device names to the corresponding JIT device metadata. std::unordered_map compilation_devices_ - GUARDED_BY(mutex_); + TF_GUARDED_BY(mutex_); // A description of a Tensorflow operator that can be compiled to XLA. struct OpRegistration { @@ -292,7 +292,7 @@ class XlaOpRegistry { // Registrations present under the same key must satisfy IsCompatible above, // and this is checked during registration. 
std::unordered_map>> ops_ - GUARDED_BY(mutex_); + TF_GUARDED_BY(mutex_); // Have we already registered the JIT kernels on the JIT devices? bool jit_kernels_registered_ = false; @@ -301,7 +301,7 @@ class XlaOpRegistry { // registrations created by RegisterCompilationKernels() and // RegisterDeviceKernels(). std::vector> - kernel_registrars_ GUARDED_BY(mutex_); + kernel_registrars_ TF_GUARDED_BY(mutex_); }; // REGISTER_XLA_OP() registers an XLA OpKernel by name, for example: diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index d6d154b2506..a2993058321 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -17,6 +17,7 @@ package_group( "//tensorflow/compiler/...", "//tensorflow/python/tpu/...", "//third_party/py/jax/...", + "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index 33d1de370de..4211b9a8b1c 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -134,10 +134,10 @@ class ClientLibrary { tensorflow::mutex service_mutex_; // Guards the singleton creation state. std::unordered_map> - local_instances_ GUARDED_BY(service_mutex_); + local_instances_ TF_GUARDED_BY(service_mutex_); std::unordered_map> - compile_only_instances_ GUARDED_BY(service_mutex_); + compile_only_instances_ TF_GUARDED_BY(service_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary); }; diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index 11a8e27af11..17fb4c3c369 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/prng.h" #include +#include #include "absl/base/casts.h" #include "tensorflow/compiler/xla/client/lib/constants.h" @@ -134,14 +135,26 @@ XlaOp Uint32sToUint64(std::array u32s) { ConstantR0WithType(builder, U64, 32)); } -// Given the initial state and the request number of random numbers to be +// Given the initial state and the request shape of random numbers to be // generated, returns the input for the random number generator and a new state. std::pair GetThreeFryInputsAndUpdatedState( - XlaOp initial_state, const int64 size) { + XlaOp initial_state, const Shape& shape) { XlaBuilder* builder = initial_state.builder(); - XlaOp input_u64 = Iota(builder, U64, size); - input_u64 = input_u64 + initial_state; - XlaOp new_state = initial_state + ConstantR0(builder, size); + auto u64_shape = ShapeUtil::MakeShape(U64, shape.dimensions()); + // initial_state is an R1, so reshape it to a scalar. + auto input_u64 = Broadcast(Reshape(initial_state, {}), shape.dimensions()); + int64 trailing_dims_product = 1; + for (int64 i = shape.rank() - 1; i >= 0; --i) { + if (shape.dimensions(i) < 2) { + continue; + } + input_u64 = + input_u64 + (Iota(builder, u64_shape, i) * + ConstantR0(builder, trailing_dims_product)); + trailing_dims_product *= shape.dimensions(i); + } + XlaOp new_state = + initial_state + ConstantR0(builder, ShapeUtil::ElementsIn(shape)); return std::make_pair(Uint64ToUint32s(input_u64), new_state); } @@ -149,11 +162,46 @@ std::pair GetThreeFryInputsAndUpdatedState( // implementation. Returns the random bits and the new state. 
RngOutput ThreeFryRngBit32(XlaOp key, XlaOp initial_state, const Shape& shape) { XlaBuilder* builder = key.builder(); + // Try to split the shape on a dimension > 1 into two halves, each + // representing a U32 value. + std::vector half_shape_dims; + std::vector padded_full_shape_dims; + int64 split_dim = -1; + for (int64 i = 0; i < shape.rank(); ++i) { + if (shape.dimensions(i) > 1 && split_dim < 0) { + half_shape_dims.push_back(CeilOfRatio(shape.dimensions(i), 2)); + // Create a new trivial dim for the later concat, which is more friendly + // to sharding propagation. + half_shape_dims.push_back(1); + split_dim = i; + padded_full_shape_dims.push_back(half_shape_dims[i] * 2); + } else { + half_shape_dims.push_back(shape.dimensions(i)); + padded_full_shape_dims.push_back(shape.dimensions(i)); + } + } + auto half_shape = ShapeUtil::MakeShape(shape.element_type(), half_shape_dims); + if (split_dim >= 0) { + std::pair inputs_state = + GetThreeFryInputsAndUpdatedState(initial_state, half_shape); + ThreeFry2x32State inputs = inputs_state.first; + ThreeFry2x32State outputs = ThreeFry2x32(inputs, Uint64ToUint32s(key)); + XlaOp result = ConcatInDim(builder, outputs, split_dim + 1); + result = Reshape(result, padded_full_shape_dims); + if (shape.dimensions(split_dim) % 2 != 0) { + result = Slice(result, std::vector(shape.rank(), 0), + shape.dimensions(), std::vector(shape.rank(), 1)); + } + return {result, inputs_state.second}; + } + // Use an R1 shape if the previous attempt failed. const int64 size = ShapeUtil::ElementsIn(shape); const int64 half_size = CeilOfRatio(size, 2); const bool size_is_odd = (half_size * 2 != size); std::pair inputs_state = - GetThreeFryInputsAndUpdatedState(initial_state, half_size); + GetThreeFryInputsAndUpdatedState( + initial_state, + ShapeUtil::MakeShape(shape.element_type(), {half_size})); ThreeFry2x32State inputs = inputs_state.first; ThreeFry2x32State outputs = ThreeFry2x32(inputs, Uint64ToUint32s(key)); if (size_is_odd) { @@ -167,14 +215,12 @@ RngOutput ThreeFryRngBit32(XlaOp key, XlaOp initial_state, const Shape& shape) { // Generates random 64bits with the given shape using the Three Fry // implementation. Returns the random bits and the new state. RngOutput ThreeFryRngBit64(XlaOp key, XlaOp initial_state, const Shape& shape) { - const int64 size = ShapeUtil::ElementsIn(shape); std::pair inputs_state = - GetThreeFryInputsAndUpdatedState(initial_state, size); + GetThreeFryInputsAndUpdatedState(initial_state, shape); ThreeFry2x32State inputs = inputs_state.first; ThreeFry2x32State outputs = ThreeFry2x32(inputs, Uint64ToUint32s(key)); XlaOp result = Uint32sToUint64(outputs); - return {Reshape(result, AsInt64Slice(shape.dimensions())), - inputs_state.second}; + return {result, inputs_state.second}; } // The key of the Philox random number generator. diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc index 7d8f433bac8..1ea713467f8 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.cc +++ b/tensorflow/compiler/xla/client/lib/slicing.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/slicing.h" +#include #include +#include #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" @@ -24,6 +26,18 @@ limitations under the License. 
namespace xla { +XlaOp DynamicStridedSlice(XlaOp input, absl::Span base_indices, + absl::Span window_sizes, + absl::Span strides) { + XlaOp sliced_input = DynamicSlice(input, base_indices, window_sizes); + if (std::any_of(strides.begin(), strides.end(), + [](int64 stride) { return stride != 1; })) { + sliced_input = Slice(sliced_input, std::vector(window_sizes.size()), + window_sizes, strides); + } + return sliced_input; +} + XlaOp SliceInMinorDims(XlaOp x, absl::Span start, absl::Span end) { XlaBuilder* builder = x.builder(); diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h index cf83d63cec2..e6b72890b7d 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.h +++ b/tensorflow/compiler/xla/client/lib/slicing.h @@ -22,6 +22,13 @@ limitations under the License. namespace xla { +// Slices input starting from the base_indices and within the window_sizes, +// using the supplied strides. This is the equivalent of the Python slicing op +// [base_indices : base_indices+window_sizes : stride]. +XlaOp DynamicStridedSlice(XlaOp input, absl::Span base_indices, + absl::Span window_sizes, + absl::Span strides); + // Updates a slice of 'x', i.e., // x[start[0], ..., start[n]] = update XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span start); diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index df070d97ff7..afe115deda8 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -346,6 +346,23 @@ StatusOr>> LocalClient::Compile( VLOG(3) << "Set device ordinal to default value of: " << updated_options.device_ordinal(); } + if (options.has_device_assignment()) { + if (options.device_assignment().replica_count() != options.num_replicas()) { + return InvalidArgument( + "Mismatched number of replicas for device " + "assignment and computation (%d vs %d).\n%s", + options.device_assignment().replica_count(), options.num_replicas(), + options.device_assignment().ToString()); + } + if (options.device_assignment().computation_count() != + options.num_partitions()) { + return InvalidArgument( + "Mismatched number of partitions for device " + "assignment and computation (%d vs %d).\n%s", + options.device_assignment().computation_count(), + options.num_partitions(), options.device_assignment().ToString()); + } + } TF_ASSIGN_OR_RETURN(std::vector> executables, local_service_->CompileExecutables( computation, argument_layouts, updated_options)); diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index da9ac3553ad..888db7536e4 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -489,8 +489,9 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, } // Eliminate the size one dimensions. - TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, - ReshapeInternal(reshaped_shape, operand)); + TF_ASSIGN_OR_RETURN( + XlaOp reshaped_operand, + ReshapeInternal(reshaped_shape, operand, /*inferred_dimension=*/-1)); // Broadcast 'reshape' up to the larger size. 
return InDimBroadcast(broadcast_shape, reshaped_operand, broadcast_dimensions); @@ -498,12 +499,10 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, XlaOp XlaBuilder::UnaryOp(HloOpcode unop, XlaOp operand) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferUnaryOpShape(unop, *operand_shape)); - *instr.mutable_shape() = shape.ToProto(); - return AddInstruction(std::move(instr), unop, {operand}); + return AddOpWithShape(unop, shape, {operand}); }); } @@ -511,31 +510,17 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, absl::Span broadcast_dimensions, absl::optional direction) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferBinaryOpShape( binop, *lhs_shape, *rhs_shape, broadcast_dimensions)); - *instr.mutable_shape() = shape.ToProto(); - if (binop == HloOpcode::kCompare) { - if (!direction.has_value()) { - return InvalidArgument( - "kCompare expects a ComparisonDirection, but none provided."); - } - instr.set_comparison_direction(ComparisonDirectionToString(*direction)); - } else if (direction.has_value()) { - return InvalidArgument( - "A comparison direction is provided for a non-compare opcode: %s.", - HloOpcodeString(binop)); - } const int64 lhs_rank = lhs_shape->rank(); const int64 rhs_rank = rhs_shape->rank(); XlaOp updated_lhs = lhs; XlaOp updated_rhs = rhs; - if (!broadcast_dimensions.empty() && lhs_rank != rhs_rank) { const bool should_broadcast_lhs = lhs_rank < rhs_rank; XlaOp from = should_broadcast_lhs ? 
lhs : rhs; @@ -576,13 +561,35 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs, AddBroadcastSequence(shape, updated_rhs)); } - return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs}); + return BinaryOpNoBroadcast(binop, shape, updated_lhs, updated_rhs, + direction); + }); +} + +XlaOp XlaBuilder::BinaryOpNoBroadcast( + HloOpcode binop, const Shape& shape, XlaOp lhs, XlaOp rhs, + absl::optional direction) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + if (binop == HloOpcode::kCompare) { + if (!direction.has_value()) { + return InvalidArgument( + "kCompare expects a ComparisonDirection, but none provided."); + } + instr.set_comparison_direction(ComparisonDirectionToString(*direction)); + } else if (direction.has_value()) { + return InvalidArgument( + "A comparison direction is provided for a non-compare opcode: %s.", + HloOpcodeString(binop)); + } + + return AddInstruction(std::move(instr), binop, {lhs, rhs}); }); } XlaOp XlaBuilder::TernaryOp(HloOpcode triop, XlaOp lhs, XlaOp rhs, XlaOp ehs) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; XlaOp updated_lhs = lhs; XlaOp updated_rhs = rhs; XlaOp updated_ehs = ehs; @@ -635,8 +642,8 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, XlaOp lhs, XlaOp rhs, XlaOp ehs) { "%s Input scalar shapes may have been changed to non-scalar shapes.", status_or_shape.status().error_message()); } - *instr.mutable_shape() = status_or_shape.ConsumeValueOrDie().ToProto(); - return AddInstruction(std::move(instr), triop, + + return AddOpWithShape(triop, status_or_shape.ValueOrDie(), {updated_lhs, updated_rhs, updated_ehs}); }); } @@ -749,8 +756,9 @@ XlaOp XlaBuilder::BroadcastInDim( TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); // Output shape, in the case of degenerate broadcast, the out_dim_size is // not necessarily the same as the dimension sizes of the output shape. 
- auto output_shape = - ShapeUtil::MakeShape(operand_shape->element_type(), out_dim_size); + TF_ASSIGN_OR_RETURN(auto output_shape, + ShapeUtil::MakeValidatedShape( + operand_shape->element_type(), out_dim_size)); if (operand_shape->rank() != broadcast_dimensions.size()) { return InvalidArgument( "Size of broadcast_dimensions has to match operand's rank; operand " @@ -1616,12 +1624,10 @@ XlaOp XlaBuilder::Sort(absl::Span operands, XlaOp XlaBuilder::ConvertElementType(XlaOp operand, PrimitiveType new_element_type) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape( *operand_shape, new_element_type)); - *instr.mutable_shape() = shape.ToProto(); - return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand}); + return AddOpWithShape(HloOpcode::kConvert, shape, {operand}); }); } @@ -2805,6 +2811,13 @@ StatusOr XlaBuilder::AddInstruction(HloInstructionProto&& instr, return op; } +StatusOr XlaBuilder::AddOpWithShape(HloOpcode opcode, const Shape& shape, + absl::Span operands) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return AddInstruction(std::move(instr), opcode, operands); +} + void XlaBuilder::AddCalledComputation(const XlaComputation& computation, HloInstructionProto* instr) { absl::flat_hash_map remapped_ids; diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index dc5c83e0bfb..9d03141715f 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -95,6 +95,7 @@ class XlaOp { int64 handle() const { return handle_; } friend class XlaBuilder; + friend class MlirHloBuilder; // < 0 means "invalid handle". int64 handle_; @@ -139,7 +140,7 @@ class XlaBuilder { XlaBuilder(const XlaBuilder&) = delete; XlaBuilder& operator=(const XlaBuilder&) = delete; - ~XlaBuilder(); + virtual ~XlaBuilder(); // Returns the computation name. const string& name() const { return name_; } @@ -277,7 +278,7 @@ class XlaBuilder { StatusOr GetShape(XlaOp op) const; // Returns the shape of the given op. - StatusOr GetShapePtr(XlaOp op) const; + virtual StatusOr GetShapePtr(XlaOp op) const; // Returns the (inferred) result for the current computation's shape. This // assumes the root instruction is the last added instruction. @@ -645,7 +646,7 @@ class XlaBuilder { StatusOr LookUpMutableInstructionByHandle(int64 handle); // Internal helper method that does the building for an arbitrary unary op. - XlaOp UnaryOp(HloOpcode unop, XlaOp operand); + virtual XlaOp UnaryOp(HloOpcode unop, XlaOp operand); // Internal helper method that does the building for an arbitrary binary op. // broadcast_dimensions specifies which dimensions to use for broadcasting @@ -655,14 +656,21 @@ class XlaBuilder { absl::Span broadcast_dimensions, absl::optional direction = absl::nullopt); + // Internal helper method that does the building for an arbitrary binary op + // with same ranked operands that doesn't broadcast. + virtual XlaOp BinaryOpNoBroadcast( + HloOpcode binop, const Shape& shape, XlaOp lhs, XlaOp rhs, + absl::optional direction); + // Internal helper method that does the building for an arbitrary ternary op. 
XlaOp TernaryOp(HloOpcode triop, XlaOp lhs, XlaOp rhs, XlaOp ehs); XlaOp RngOp(RandomDistribution distribution, absl::Span parameters, const Shape& shape); - StatusOr InDimBroadcast(const Shape& shape, XlaOp operand, - absl::Span broadcast_dimensions); + virtual StatusOr InDimBroadcast( + const Shape& shape, XlaOp operand, + absl::Span broadcast_dimensions); // Internal helper method that creates a sequence of instructions that // performs an explicit broadcast of the operand to the target shape. @@ -671,8 +679,8 @@ class XlaBuilder { // Internal helper method for creating a Reshape op with the already inferred // shape. - StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, - int64 inferred_dimension = -1); + virtual StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, + int64 inferred_dimension); // Returns the (inferred) result for the program shape using the given root. StatusOr GetProgramShape(int64 root_id) const; @@ -1056,15 +1064,20 @@ class XlaBuilder { friend XlaOp GetDimensionSize(XlaOp operand, int64 dimension); friend XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64 dimension); + protected: + // Returns OK status if the given op was built using this builder. Otherwise, + // returns an error. + Status CheckOpBuilder(XlaOp op) const; + private: XlaOp ConditionalImpl( XlaOp branch_index, absl::Span branch_computations, absl::Span branch_operands); - // Returns OK status if the given op was built using this builder. Otherwise, - // returns an error. - Status CheckOpBuilder(XlaOp op) const; + // Creates an op with the given opcode and the output shape. + virtual StatusOr AddOpWithShape(HloOpcode opcode, const Shape& shape, + absl::Span operands); // Here, InstructionType is either const HloInstructionProto* or non-const // HloInstructionProto*. 
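The xla_builder.h changes above make the low-level emission helpers (UnaryOp, BinaryOpNoBroadcast, InDimBroadcast, ReshapeInternal, AddOpWithShape) virtual and give XlaBuilder a virtual destructor, so alternative builders such as the MlirHloBuilder named in the new friend declaration can subclass XlaBuilder and intercept instruction creation while reusing its shape-inference paths. A minimal sketch of that extension point, using a made-up subclass name (RejectingBuilder is not part of this change):

#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/util.h"

namespace xla {

// Hypothetical subclass: keeps XlaBuilder's shape inference and broadcasting
// logic but refuses to emit HLO, the way a non-HLO builder might for ops it
// has not implemented yet.
class RejectingBuilder : public XlaBuilder {
 public:
  using XlaBuilder::XlaBuilder;

 private:
  StatusOr<XlaOp> AddOpWithShape(HloOpcode opcode, const Shape& shape,
                                 absl::Span<const XlaOp> operands) override {
    return Unimplemented("RejectingBuilder cannot emit %s",
                         HloOpcodeString(opcode));
  }
};

}  // namespace xla

AddOpWithShape is only one of the new hooks; a full replacement builder would also override BinaryOpNoBroadcast and the other virtual methods declared in the hunk above.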
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index fd227ea47f2..115a822b323 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -329,6 +329,17 @@ TEST_F(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) { op::Broadcast(op::Reshape(op::Broadcast()))); } +TEST_F(XlaBuilderTest, BroadcastInDimWithNegativeSize) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 1, 4}), "x"); + BroadcastInDim(x, {-3, 3, 4}, + /*broadcast_dimensions=*/{0, 1, 2}); + auto statusor = BuildHloModule(&b); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("shape's dimensions must not be < 0")); +} + TEST_F(XlaBuilderTest, OperandFromWrongBuilder) { XlaBuilder b1("b1"); auto p0 = Parameter(&b1, 0, ShapeUtil::MakeShape(F32, {}), "p0"); diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index a0f60408296..1228ad527e3 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -30,6 +30,8 @@ std::string RunId::ToString() const { return "RunId: " + std::to_string(data_); } +int64 RunId::ToInt() const { return data_; } + ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal( int device_ordinal) { device_ordinal_ = device_ordinal; diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index b44d5f13b68..6981b35975f 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -55,6 +55,7 @@ class RunId { RunId& operator=(const RunId&) = default; friend bool operator==(const RunId& a, const RunId& b); std::string ToString() const; + int64 ToInt() const; template friend H AbslHashValue(H h, const RunId& id) { diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 6c7aff3b11e..44e6a3c7bdb 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -133,8 +133,9 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { } else if (shape.IsArray()) { if (allocate_arrays) { // Literals can be used as DMA targets, which can require alignment. We - // force a 16-byte minimum alignment. - constexpr int kMinimumAlignment = 16; + // force a tensorflow::Allocator::kAllocatorAlignment-byte minimum + // alignment. + constexpr int kMinimumAlignment = 64; piece->set_buffer(static_cast(tensorflow::port::AlignedMalloc( piece->size_bytes(), kMinimumAlignment))); } diff --git a/tensorflow/compiler/xla/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/parse_flags_from_env_test.cc index f967a788dec..e3552470f63 100644 --- a/tensorflow/compiler/xla/parse_flags_from_env_test.cc +++ b/tensorflow/compiler/xla/parse_flags_from_env_test.cc @@ -143,6 +143,9 @@ TEST(ParseFlagsFromEnv, EnvAndFlag) { string stdout_str; int child_status = child.Communicate(nullptr, &stdout_str, nullptr); CHECK_EQ(child_status, 0) << "test " << i; + // On windows, we get CR characters. Remove them. 
+ stdout_str.erase(std::remove(stdout_str.begin(), stdout_str.end(), '\r'), + stdout_str.end()); CHECK_EQ(stdout_str, test[i].expected_value) << "test " << i; } } diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index d6c1a034859..3c93ec96113 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,7 +1,10 @@ load("//tensorflow/core/platform:build_config.bzl", "pyx_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_py_test_deps") -load("//tensorflow:tensorflow.bzl", "pybind_extension") load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_test") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "pybind_extension") package( default_visibility = ["//tensorflow:internal"], @@ -59,6 +62,7 @@ cc_library( features = ["-use_header_modules"], deps = [ ":bfloat16", + ":local_client", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", @@ -165,6 +169,7 @@ cc_library( name = "local_client", srcs = ["local_client.cc"], hdrs = ["local_client.h"], + visibility = ["//tensorflow/compiler/xla:friends"], deps = [ ":local_device_state", ":shared_device_buffer", @@ -181,6 +186,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/core:allocator", "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme", @@ -292,15 +298,19 @@ cc_library( name = "nvidia_gpu_device", srcs = ["nvidia_gpu_device.cc"], hdrs = ["nvidia_gpu_device.h"], + copts = if_cuda(["-DNCCL_ENABLED=1"]), deps = [ ":local_client", + "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/python/distributed:client", "//tensorflow/compiler/xla/service:platform_util", + "//tensorflow/compiler/xla:util", "//tensorflow/core:bfc_allocator", "//tensorflow/core:gpu_mem_allocator", "//tensorflow/stream_executor:tf_allocator_adapter", - ], + ] + if_cuda(["@local_config_nccl//:nccl"]), ) config_setting( @@ -355,6 +365,9 @@ pybind_extension( "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", "//tensorflow/compiler/xla/client/lib:sorting", "//tensorflow/compiler/xla/client/lib:svd", + "//tensorflow/compiler/xla/python/distributed", + "//tensorflow/compiler/xla/python/distributed:client", + "//tensorflow/compiler/xla/python/distributed:service", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:custom_call_target_registry", "//tensorflow/compiler/xla/service:hlo", @@ -383,3 +396,25 @@ pybind_extension( "//conditions:default": [], }), ) + +tf_cc_test( + name = "gpu_multistream_test", + srcs = ["gpu_multistream_test.cc"], + tags = [ + # TODO(phawkins): figure out TF test infra such that this only runs under GPU. 
+ "no_oss", + "requires-gpu-nvidia", + ], + deps = [ + ":local_client", + ":nvidia_gpu_device", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:executable_build_options", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + "//tensorflow/core/platform:random", + ], +) diff --git a/tensorflow/compiler/xla/python/cpu_device.cc b/tensorflow/compiler/xla/python/cpu_device.cc index 6bb17f12b89..6b55eac0c08 100644 --- a/tensorflow/compiler/xla/python/cpu_device.cc +++ b/tensorflow/compiler/xla/python/cpu_device.cc @@ -37,21 +37,21 @@ StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(options)); - std::vector> devices; + std::vector> devices; for (int i = 0; i < client->device_count(); ++i) { se::StreamExecutor* executor = client->backend().stream_executor(i).ValueOrDie(); auto device_state = absl::make_unique( executor, client, /*synchronous_deallocation=*/true, asynchronous, /*allow_event_reuse=*/false); - std::shared_ptr device = - std::make_shared(i, std::move(device_state)); + auto device = absl::make_unique(i, std::move(device_state)); devices.push_back(std::move(device)); } return std::make_shared( kCpuPlatformName, client, std::move(devices), /*host_id=*/0, - /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr); + /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr, + /*gpu_run_options=*/nullptr); } } // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/BUILD b/tensorflow/compiler/xla/python/distributed/BUILD new file mode 100644 index 00000000000..b38084c3395 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/BUILD @@ -0,0 +1,122 @@ +load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +licenses(["notice"]) + +package(default_visibility = ["//tensorflow:internal"]) + +tf_proto_library_cc( + name = "protocol_proto", + srcs = ["protocol.proto"], + has_services = 1, + cc_api_version = 2, + cc_grpc_version = 1, + use_grpc_namespace = True, +) + +cc_library( + name = "protocol", + hdrs = ["protocol.h"], +) + +cc_library( + name = "key_value_store", + srcs = ["key_value_store.cc"], + hdrs = ["key_value_store.h"], + deps = [ + "//tensorflow:grpc++", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + ], +) + +cc_library( + name = "service", + srcs = ["service.cc"], + hdrs = ["service.h"], + deps = [ + ":key_value_store", + ":protocol", + ":protocol_proto_cc", + ":util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "@com_google_absl//absl/time", + ], +) + +tf_cc_test( + name = "service_test", + srcs = ["service_test.cc"], + deps = [ + ":protocol_proto_cc", + ":service", + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +cc_library( + name = "client", + srcs = [ + "client.cc", + ], + hdrs = [ + "client.h", + ], + deps = [ + ":protocol", + ":protocol_proto_cc", + ":util", + "//tensorflow:grpc++", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:lib", + "@com_google_absl//absl/synchronization", + 
"@com_google_absl//absl/time", + ], +) + +cc_library( + name = "util", + hdrs = ["util.h"], + deps = [ + "//tensorflow:grpc++", + "//tensorflow/compiler/xla:status", + ], +) + +cc_library( + name = "distributed", + srcs = ["distributed.cc"], + hdrs = ["distributed.h"], + deps = [ + ":client", + ":service", + "//tensorflow:grpc++", + "//tensorflow/compiler/xla:statusor", + ], +) + +tf_cc_test( + name = "client_server_test", + srcs = ["client_server_test.cc"], + deps = [ + ":client", + ":protocol_proto_cc", + ":service", + "//tensorflow:grpc++", + "//tensorflow/compiler/xla:protobuf_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/time", + ], +) diff --git a/tensorflow/compiler/xla/python/distributed/client.cc b/tensorflow/compiler/xla/python/distributed/client.cc new file mode 100644 index 00000000000..c50c3f50a9d --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/client.cc @@ -0,0 +1,82 @@ +/* Copyright 2020 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/distributed/client.h" + +#include // NOLINT + +#include "tensorflow/compiler/xla/python/distributed/protocol.h" +#include "tensorflow/compiler/xla/python/distributed/util.h" + +namespace xla { + +DistributedRuntimeClient::DistributedRuntimeClient( + std::shared_ptr<::grpc::Channel> channel) + : stub_(grpc::DistributedRuntimeService::NewStub(std::move(channel))) {} +DistributedRuntimeClient::~DistributedRuntimeClient() = default; + +xla::Status DistributedRuntimeClient::Connect( + const LocalTopologyProto& local_topology, + GlobalTopologyProto* global_topology) { + ::grpc::ClientContext ctx; + ctx.set_fail_fast(false); + ctx.set_deadline(absl::ToChronoTime(absl::Now() + rpc_timeout_)); + ConnectRequest request; + request.set_protocol_version(kDistributedRuntimeProtocolVersion); + *request.mutable_local_topology() = local_topology; + VLOG(10) << "Connect: " << request.DebugString(); + ConnectResponse response; + ::grpc::Status status = stub_->Connect(&ctx, request, &response); + if (!status.ok()) { + return FromGrpcStatus(status); + } + VLOG(10) << "Connect() response: " << response.DebugString(); + response.mutable_global_topology()->Swap(global_topology); + return xla::Status::OK(); +} + +xla::StatusOr DistributedRuntimeClient::BlockingKeyValueGet( + std::string key, absl::Duration timeout) { + ::grpc::ClientContext ctx; + ctx.set_fail_fast(false); + ctx.set_deadline(absl::ToChronoTime(absl::Now() + timeout)); + KeyValueGetRequest request; + request.set_key(std::move(key)); + timeout = std::min(timeout, absl::Minutes(10)); // Avoid overflow + request.set_timeout_milliseconds(timeout / absl::Milliseconds(1)); + VLOG(10) << "BlockingKeyValueGet: " << request.DebugString(); + KeyValueGetResponse response; 
+ ::grpc::Status status = stub_->KeyValueGet(&ctx, request, &response); + if (!status.ok()) { + return FromGrpcStatus(status); + } + return response.value(); +} + +xla::Status DistributedRuntimeClient::KeyValueSet(std::string key, + std::string value) { + ::grpc::ClientContext ctx; + ctx.set_fail_fast(false); + ctx.set_deadline(absl::ToChronoTime(absl::Now() + rpc_timeout_)); + KeyValueSetRequest request; + request.set_key(std::move(key)); + request.set_value(std::move(value)); + VLOG(10) << "KeyValueSet: " << request.DebugString(); + KeyValueSetResponse response; + ::grpc::Status status = stub_->KeyValueSet(&ctx, request, &response); + return FromGrpcStatus(status); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/client.h b/tensorflow/compiler/xla/python/distributed/client.h new file mode 100644 index 00000000000..1ab5292bea8 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/client.h @@ -0,0 +1,50 @@ +/* Copyright 2020 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_CLIENT_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_CLIENT_H_ + +#include + +#include "grpcpp/channel.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" +#include "tensorflow/compiler/xla/python/distributed/protocol.grpc.pb.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/platform/env.h" + +namespace xla { + +class DistributedRuntimeClient { + public: + explicit DistributedRuntimeClient(std::shared_ptr<::grpc::Channel> channel); + ~DistributedRuntimeClient(); + + xla::Status Connect(const LocalTopologyProto& local_topology, + GlobalTopologyProto* global_topology); + + xla::StatusOr BlockingKeyValueGet(std::string key, + absl::Duration timeout); + + xla::Status KeyValueSet(std::string key, std::string value); + + private: + const std::unique_ptr stub_; + const absl::Duration rpc_timeout_ = absl::Seconds(120); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_CLIENT_H_ diff --git a/tensorflow/compiler/xla/python/distributed/client_server_test.cc b/tensorflow/compiler/xla/python/distributed/client_server_test.cc new file mode 100644 index 00000000000..e78949933a2 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/client_server_test.cc @@ -0,0 +1,102 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "grpcpp/security/server_credentials.h" +#include "absl/time/time.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/python/distributed/client.h" +#include "tensorflow/compiler/xla/python/distributed/protocol.pb.h" +#include "tensorflow/compiler/xla/python/distributed/service.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" + +namespace xla { +namespace { + +TEST(ClientServerTest, ConnectToServer) { + DistributedRuntimeServiceImpl service(/*num_nodes=*/2); + ::grpc::ServerBuilder builder; + builder.RegisterService(&service); + auto server = builder.BuildAndStart(); + + std::vector locals(2); + locals[0].set_node_id(0); + locals[1].set_node_id(1); + DeviceProto* d0 = locals[0].add_devices(); + d0->set_local_device_ordinal(0); + DeviceProto* d1 = locals[0].add_devices(); + d1->set_local_device_ordinal(0); + DeviceProto* d2 = locals[0].add_devices(); + d2->set_local_device_ordinal(707); + DeviceProto* d3 = locals[1].add_devices(); + d3->set_local_device_ordinal(1); + + GlobalTopologyProto expected_topology; + auto* node0 = expected_topology.add_nodes(); + auto* node1 = expected_topology.add_nodes(); + *node0 = locals[0]; + node0->mutable_devices(0)->set_global_device_id(0); + node0->mutable_devices(1)->set_global_device_id(1); + node0->mutable_devices(2)->set_global_device_id(2); + *node1 = locals[1]; + node1->mutable_devices(0)->set_global_device_id(3); + + auto thread0_fn = [&]() -> xla::Status { + DistributedRuntimeClient client( + server->InProcessChannel(::grpc::ChannelArguments())); + GlobalTopologyProto topology; + TF_RETURN_IF_ERROR(client.Connect(locals[0], &topology)); + TF_RET_CHECK( + xla::protobuf_util::ProtobufEquals(topology, expected_topology)); + TF_RETURN_IF_ERROR(client.KeyValueSet("key1", "value1")); + TF_ASSIGN_OR_RETURN( + std::string value, + client.BlockingKeyValueGet("key2", absl::InfiniteDuration())); + TF_RET_CHECK(value == "value2"); + return xla::Status::OK(); + }; + auto thread1_fn = [&]() -> xla::Status { + DistributedRuntimeClient client( + server->InProcessChannel(::grpc::ChannelArguments())); + GlobalTopologyProto topology; + TF_RETURN_IF_ERROR(client.Connect(locals[1], &topology)); + TF_RET_CHECK( + xla::protobuf_util::ProtobufEquals(topology, expected_topology)); + TF_ASSIGN_OR_RETURN( + std::string value, + client.BlockingKeyValueGet("key1", absl::InfiniteDuration())); + TF_RET_CHECK(value == "value1"); + TF_RETURN_IF_ERROR(client.KeyValueSet("key2", "value2")); + return xla::Status::OK(); + }; + + std::vector> functions = {thread0_fn, + thread1_fn}; + std::vector statuses(functions.size()); + { + tensorflow::thread::ThreadPool thread_pool( + tensorflow::Env::Default(), "test_threads", functions.size()); + for (int i = 0; i < functions.size(); ++i) { + thread_pool.Schedule([&, i]() { statuses[i] = functions[i](); }); + } + } + TF_EXPECT_OK(statuses[0]); + TF_EXPECT_OK(statuses[1]); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/distributed.cc b/tensorflow/compiler/xla/python/distributed/distributed.cc new file mode 100644 index 00000000000..6afc7b1c4e9 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/distributed.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/distributed/distributed.h" + +#include "grpcpp/grpcpp.h" + +namespace xla { + +StatusOr> +GetDistributedRuntimeService(std::string address, int num_nodes) { + auto credentials = ::grpc::InsecureServerCredentials(); + return DistributedRuntimeService::Get(address, credentials, num_nodes); +} + +std::shared_ptr GetDistributedRuntimeClient( + std::string address) { + std::shared_ptr<::grpc::ChannelCredentials> creds = + ::grpc::InsecureChannelCredentials(); + std::shared_ptr<::grpc::Channel> channel = + ::grpc::CreateChannel(address, creds); + return absl::make_unique(channel); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/distributed.h b/tensorflow/compiler/xla/python/distributed/distributed.h new file mode 100644 index 00000000000..0475c3e9feb --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/distributed.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_DISTRIBUTED_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_DISTRIBUTED_H_ + +#include +#include + +#include "tensorflow/compiler/xla/python/distributed/client.h" +#include "tensorflow/compiler/xla/python/distributed/service.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// APIs for starting the distributed runtime service and client. Note that these +// variants use insecure credentials; the functions to build the service and +// client are kept separate so that other implementations using more secure +// credentials may be provided by the user. + +// Builds a distributed runtime service. `address` is the address on which +// the service should listen, e.g., [::]:1234 . `num_nodes` is the number +// of nodes in the cluster. +StatusOr> +GetDistributedRuntimeService(std::string address, int num_nodes); + +// Builds a distributed runtime client, connecting to a service at `address`, +// where address is a gRPC-style address such as `dns:///localhost:1234`. 
+std::shared_ptr GetDistributedRuntimeClient( + std::string address); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_DISTRIBUTED_H_ diff --git a/tensorflow/compiler/xla/python/distributed/key_value_store.cc b/tensorflow/compiler/xla/python/distributed/key_value_store.cc new file mode 100644 index 00000000000..5966d4ce12b --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/key_value_store.cc @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/distributed/key_value_store.h" + +namespace xla { + +KeyValueStore::KeyValueStore() = default; + +::grpc::Status KeyValueStore::Get(const std::string& key, + absl::Duration timeout, std::string* value) { + auto key_is_present = [&]() { + mu_.AssertHeld(); + return entries_.find(key) != entries_.end(); + }; + absl::MutexLock lock(&mu_); + // TODO(phawkins): the synchronization here is very coarse, but probably + // sufficient for its current application. + if (!mu_.AwaitWithTimeout(absl::Condition(&key_is_present), timeout)) { + return ::grpc::Status(::grpc::StatusCode::NOT_FOUND, ""); + } + *value = entries_.find(key)->second; + return ::grpc::Status::OK; +} + +::grpc::Status KeyValueStore::Set(const std::string& key, std::string value) { + absl::MutexLock lock(&mu_); + entries_[key] = std::move(value); + return ::grpc::Status::OK; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/key_value_store.h b/tensorflow/compiler/xla/python/distributed/key_value_store.h new file mode 100644 index 00000000000..8560305e6f6 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/key_value_store.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_KEY_VALUE_STORE_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_KEY_VALUE_STORE_H_ + +#include "grpcpp/grpcpp.h" +#include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" +#include "absl/synchronization/mutex.h" +#include "absl/time/time.h" + +namespace xla { + +// A simple blocking key-value store class. 
+class KeyValueStore { + public: + KeyValueStore(); + + KeyValueStore(const KeyValueStore&) = delete; + KeyValueStore(KeyValueStore&&) = delete; + KeyValueStore& operator=(const KeyValueStore&) = delete; + KeyValueStore&& operator=(KeyValueStore&&) = delete; + + // Looks up `key`. If present, returns its value. If the key is not present, + // waits until `timeout` expires for the key to arrive. If the key does not + // arrive by the expiry of `timeout`, returns NOT_FOUND. + ::grpc::Status Get(const std::string& key, absl::Duration timeout, + std::string* value); + + // Replaces the value of `key` with `value`. + ::grpc::Status Set(const std::string& key, std::string value); + + private: + absl::Mutex mu_; + absl::flat_hash_map entries_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_KEY_VALUE_STORE_H_ diff --git a/tensorflow/lite/python/optimize/sparsification_wrapper.i b/tensorflow/compiler/xla/python/distributed/protocol.h similarity index 69% rename from tensorflow/lite/python/optimize/sparsification_wrapper.i rename to tensorflow/compiler/xla/python/distributed/protocol.h index d7db2854bc2..208c6dab8c5 100644 --- a/tensorflow/lite/python/optimize/sparsification_wrapper.i +++ b/tensorflow/compiler/xla/python/distributed/protocol.h @@ -13,14 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -%include "std_string.i" +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_PROTOCOL_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_PROTOCOL_H_ +namespace xla { -%{ -#define SWIG_FILE_WITH_INIT -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/python/optimize/sparsification_wrapper.h" -%} +static constexpr int kDistributedRuntimeProtocolVersion = 1; +} // namespace xla -%include "tensorflow/lite/python/optimize/sparsification_wrapper.h" +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_PROTOCOL_H_ diff --git a/tensorflow/compiler/xla/python/distributed/protocol.proto b/tensorflow/compiler/xla/python/distributed/protocol.proto new file mode 100644 index 00000000000..18bfa221110 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/protocol.proto @@ -0,0 +1,103 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================== +// +// Distributed XLA service protocol. +// +// This is a minimal distributed protocol intended primarily for sharing NCCL +// communicator state between distributed hosts. +// +// The intention is to replace this with a more capable distributed runtime at +// some point in the near future, but this suffices for simple multihost GPU +// use cases. +// +// The intention is that a service is started during cluster initialization and +// persists for the lifetime of the cluster. 
+// +// TODO(phawkins): add a health-checking mechanism. + +syntax = "proto3"; + +package xla; + +// Describes a device local to a host. +message DeviceProto { + int32 local_device_ordinal = 1; + string name = 2; + string vendor = 3; + + // The following fields are present in the GlobalTopologyProto message + // returned by Connect() but not in the LocalTopologyProto messages passed to + // Connect(). In other words, the master node determines the global device IDs + // during Connect(). + int32 global_device_id = 4; // Globally unique ID number. +} + +// Describes the set of devices local to a host. +message LocalTopologyProto { + // We assume that each node knows its globally-unique node ID, provided by + // whatever mechanism launches the tasks. Node IDs should form a dense range + // of integers [0, num_nodes). + int32 node_id = 1; + repeated DeviceProto devices = 2; +} + +message GlobalTopologyProto { + repeated LocalTopologyProto nodes = 1; +} + +message ConnectRequest { + int32 protocol_version = 1; // Always 1 at present. + + LocalTopologyProto local_topology = 2; +} + +message ConnectResponse { + GlobalTopologyProto global_topology = 2; +} + +message KeyValueGetRequest { + bytes key = 1; + int32 timeout_milliseconds = 2; +} + +message KeyValueGetResponse { + bool found = 1; + bytes value = 2; +} + +message KeyValueSetRequest { + bytes key = 1; + bytes value = 2; +} + +message KeyValueSetResponse {} + +service DistributedRuntimeService { + // Connects a node to the distributed master node. Blocks until all workers + // have connected. The service receives the number of nodes to expect as an + // option passed to its constructor. + rpc Connect(ConnectRequest) returns (ConnectResponse) {} + + // Simple key-value store used for sharing configuration data. + // For example, when using NCCL to communicate between multiple GPUs, + // the NCCL communicator IDs are stored here. + + // Looks up a key in the key-value service. Blocks until the key is present + // or until `timeout` expires. + rpc KeyValueGet(KeyValueGetRequest) returns (KeyValueGetResponse) {} + + // Updates the value associated with a key. + rpc KeyValueSet(KeyValueSetRequest) returns (KeyValueSetResponse) {} +} diff --git a/tensorflow/compiler/xla/python/distributed/service.cc b/tensorflow/compiler/xla/python/distributed/service.cc new file mode 100644 index 00000000000..cc2b3a5aca2 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/service.cc @@ -0,0 +1,154 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/python/distributed/service.h" + +#include "tensorflow/compiler/xla/python/distributed/protocol.h" +#include "tensorflow/compiler/xla/python/distributed/util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/util.h" + +namespace xla { + +DistributedRuntimeServiceImpl::DistributedRuntimeServiceImpl(int num_nodes) { + nodes_.resize(num_nodes); + local_topologies_.resize(num_nodes); +} + +// Steals the contents of `local_topologies`. +void BuildGlobalTopology(absl::Span local_topologies, + GlobalTopologyProto* global_topology) { + int next_global_device_id = 0; + for (LocalTopologyProto& local : local_topologies) { + for (DeviceProto& device : *local.mutable_devices()) { + device.set_global_device_id(next_global_device_id++); + } + global_topology->add_nodes()->Swap(&local); + } +} + +::grpc::Status DistributedRuntimeServiceImpl::Connect( + ::grpc::ServerContext* context, const ConnectRequest* request, + ConnectResponse* response) { + VLOG(10) << "Connect " << request->DebugString(); + if (request->protocol_version() != kDistributedRuntimeProtocolVersion) { + return ToGrpcStatus(xla::InvalidArgument("Invalid protocol version %d", + request->protocol_version())); + } + absl::MutexLock lock(&mu_); + if (state_ != State::kInitializing) { + return ToGrpcStatus(xla::FailedPrecondition( + "Connect() called when system is not initializing.")); + } + int node_id = request->local_topology().node_id(); + if (node_id < 0 || node_id >= nodes_.size()) { + return ToGrpcStatus( + xla::InvalidArgument("Invalid node ID %d, must be in the range [0, %d)", + node_id, nodes_.size())); + } + if (nodes_[node_id].present) { + return ToGrpcStatus(xla::InvalidArgument("Duplicate node ID %d", node_id)); + } + nodes_[node_id].present = true; + local_topologies_[node_id] = request->local_topology(); + ++num_nodes_present_; + + auto all_nodes_present = [&]() { + mu_.AssertHeld(); + return num_nodes_present_ == nodes_.size(); + }; + if (!mu_.AwaitWithTimeout(absl::Condition(&all_nodes_present), + kConnectTimeout)) { + return ToGrpcStatus(tensorflow::errors::DeadlineExceeded( + "Timed out after %s waiting for all nodes to call Connect()", + absl::FormatDuration(kConnectTimeout))); + } + + if (node_id == 0) { + BuildGlobalTopology(absl::Span(local_topologies_), + &topology_); + local_topologies_.clear(); + state_ = State::kRunning; + } else { + auto running = [&]() { + mu_.AssertHeld(); + return state_ == State::kRunning; + }; + mu_.Await(absl::Condition(&running)); + } + *response->mutable_global_topology() = topology_; + return ::grpc::Status::OK; +} + +::grpc::Status DistributedRuntimeServiceImpl::KeyValueGet( + ::grpc::ServerContext* context, const KeyValueGetRequest* request, + KeyValueGetResponse* response) { + VLOG(10) << "KeyValueGet " << request->DebugString(); + { + absl::MutexLock lock(&mu_); + if (state_ != State::kRunning) { + return ToGrpcStatus(xla::FailedPrecondition( + "KeyValueGet() called when system is not running.")); + } + } + return key_value_store_.Get( + request->key(), absl::Milliseconds(request->timeout_milliseconds()), + response->mutable_value()); +} + +::grpc::Status DistributedRuntimeServiceImpl::KeyValueSet( + ::grpc::ServerContext* context, const KeyValueSetRequest* request, + KeyValueSetResponse* response) { + VLOG(10) << "KeyValueSet " << request->DebugString(); + { + absl::MutexLock lock(&mu_); + if (state_ != State::kRunning) { + return 
ToGrpcStatus(xla::FailedPrecondition( + "KeyValueSet() called when system is not running; clients must call " + "Connect() first")); + } + } + return key_value_store_.Set(request->key(), request->value()); +} + +xla::StatusOr> +DistributedRuntimeService::Get( + const std::string& address, + std::shared_ptr<::grpc::ServerCredentials> credentials, int num_nodes) { + auto service = absl::make_unique(num_nodes); + ::grpc::ServerBuilder builder; + builder.AddListeningPort(address, credentials); + VLOG(1) << "Distributed runtime service address " << address; + builder.RegisterService(&service->impl_); + service->server_ = builder.BuildAndStart(); + if (!service->server_) { + return xla::Unknown("Failed to start RPC server"); + } + LOG(INFO) << "Jax service listening on " << address; + return service; +} + +DistributedRuntimeService::DistributedRuntimeService(int num_nodes) + : impl_(num_nodes) {} + +DistributedRuntimeService::~DistributedRuntimeService() { + if (server_) { + LOG(INFO) << "Jax service shutting down"; + server_->Shutdown(); + server_->Wait(); + } +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/service.h b/tensorflow/compiler/xla/python/distributed/service.h new file mode 100644 index 00000000000..baf470e4f13 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/service.h @@ -0,0 +1,101 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_SERVICE_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_SERVICE_H_ + +#include "absl/time/time.h" +#include "tensorflow/compiler/xla/python/distributed/key_value_store.h" +#include "tensorflow/compiler/xla/python/distributed/protocol.grpc.pb.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +typedef int NodeId; + +class DistributedRuntimeServiceImpl final + : public grpc::DistributedRuntimeService::Service { + public: + explicit DistributedRuntimeServiceImpl(int num_nodes); + + DistributedRuntimeServiceImpl(const DistributedRuntimeServiceImpl&) = delete; + DistributedRuntimeServiceImpl(DistributedRuntimeServiceImpl&&) = delete; + DistributedRuntimeServiceImpl& operator=( + const DistributedRuntimeServiceImpl&) = delete; + DistributedRuntimeServiceImpl&& operator=(DistributedRuntimeServiceImpl&&) = + delete; + + ::grpc::Status Connect(::grpc::ServerContext* context, + const ConnectRequest* request, + ConnectResponse* response) override; + + ::grpc::Status KeyValueGet(::grpc::ServerContext* context, + const KeyValueGetRequest* request, + KeyValueGetResponse* response) override; + + ::grpc::Status KeyValueSet(::grpc::ServerContext* context, + const KeyValueSetRequest* request, + KeyValueSetResponse* response) override; + + private: + const absl::Duration kConnectTimeout = absl::Seconds(120); + + absl::Mutex mu_; + enum class State { kInitializing, kRunning }; + State state_ GUARDED_BY(mu_) = State::kInitializing; + + std::vector local_topologies_ GUARDED_BY(mu_); + GlobalTopologyProto topology_ GUARDED_BY(mu_); + struct Node { + bool present = false; + }; + int num_nodes_present_ GUARDED_BY(mu_) = 0; + std::vector nodes_ GUARDED_BY(mu_); + + KeyValueStore key_value_store_; +}; + +class DistributedRuntimeService { + public: + static xla::StatusOr> Get( + const std::string& address, + std::shared_ptr<::grpc::ServerCredentials> credentials, int num_nodes); + + explicit DistributedRuntimeService(int num_nodes); + ~DistributedRuntimeService(); + + DistributedRuntimeService(const DistributedRuntimeService&) = delete; + DistributedRuntimeService(DistributedRuntimeService&&) = delete; + DistributedRuntimeService& operator=(const DistributedRuntimeService&) = + delete; + DistributedRuntimeService& operator=(DistributedRuntimeService&&) = delete; + + ::grpc::Server* server() const { return server_.get(); } + + private: + DistributedRuntimeServiceImpl impl_; + std::unique_ptr<::grpc::Server> server_; +}; + +// Everything below this point is exposed only for tests. + +// Given a LocalTopologyProto object from each node, builds a +// GlobalTopologyProto that describes all nodes. +void BuildGlobalTopology(absl::Span local_topologies, + GlobalTopologyProto* global_topology); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_SERVICE_H_ diff --git a/tensorflow/compiler/xla/python/distributed/service_test.cc b/tensorflow/compiler/xla/python/distributed/service_test.cc new file mode 100644 index 00000000000..08326df2f38 --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/service_test.cc @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/distributed/service.h" + +#include "tensorflow/compiler/xla/python/distributed/protocol.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +TEST(TopologyTest, BuildGlobalTopology) { + std::vector locals(2); + DeviceProto* d0 = locals[0].add_devices(); + d0->set_local_device_ordinal(0); + DeviceProto* d1 = locals[0].add_devices(); + d1->set_local_device_ordinal(1); + DeviceProto* d2 = locals[1].add_devices(); + d2->set_local_device_ordinal(0); + DeviceProto* d3 = locals[1].add_devices(); + d3->set_local_device_ordinal(1); + + GlobalTopologyProto global; + BuildGlobalTopology(absl::Span(locals), &global); + EXPECT_EQ(global.nodes_size(), 2); + EXPECT_EQ(global.nodes()[0].devices_size(), 2); + EXPECT_EQ(global.nodes()[1].devices_size(), 2); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/python/distributed/util.h b/tensorflow/compiler/xla/python/distributed/util.h new file mode 100644 index 00000000000..07ae8d1f0ce --- /dev/null +++ b/tensorflow/compiler/xla/python/distributed/util.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_UTIL_H_ + +#include "grpcpp/support/status.h" +#include "tensorflow/compiler/xla/status.h" + +namespace xla { + +inline Status FromGrpcStatus(const ::grpc::Status& s) { + if (s.ok()) { + return Status::OK(); + } else { + return Status(static_cast(s.error_code()), + s.error_message()); + } +} + +inline ::grpc::Status ToGrpcStatus(const Status& s) { + if (s.ok()) { + return ::grpc::Status::OK; + } else { + return ::grpc::Status(static_cast<::grpc::StatusCode>(s.code()), + s.error_message()); + } +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_DISTRIBUTED_UTIL_H_ diff --git a/tensorflow/compiler/xla/python/dlpack.cc b/tensorflow/compiler/xla/python/dlpack.cc index b4ae503ba4c..ca34fb504bd 100644 --- a/tensorflow/compiler/xla/python/dlpack.cc +++ b/tensorflow/compiler/xla/python/dlpack.cc @@ -210,8 +210,8 @@ StatusOr DLContextForDevice(const Device& device) { return context; } -StatusOr> DeviceForDLContext( - const PyLocalClient& client, const DLContext& context) { +StatusOr DeviceForDLContext(const PyLocalClient& client, + const DLContext& context) { se::Platform::Id platform_id; switch (context.device_type) { case kDLCPU: @@ -224,13 +224,11 @@ StatusOr> DeviceForDLContext( return InvalidArgument("Unknown/unsupported DLPack device type %d", context.device_type); } - auto it = absl::c_find_if( - client.local_devices(), [&](const std::shared_ptr& device) { - return device->local_device_state()->executor()->platform()->id() == - platform_id && - device->local_device_state()->device_ordinal() == - context.device_id; - }); + auto it = absl::c_find_if(client.local_devices(), [&](Device* device) { + return device->local_device_state()->executor()->platform()->id() == + platform_id && + device->local_device_state()->device_ordinal() == context.device_id; + }); if (it == client.local_devices().end()) { return InvalidArgument( "No matching device found for DLPack device_type %d device_id %d", @@ -289,7 +287,7 @@ StatusOr BufferToDLPackManagedTensor(PyLocalBuffer* buffer) { } StatusOr> DLPackManagedTensorToBuffer( - const pybind11::capsule& tensor, std::shared_ptr client) { + const pybind11::capsule& tensor, PyLocalClient* client) { if (absl::string_view(tensor.name()) != kDlTensorCapsuleName) { return InvalidArgument( "DLPack tensor must be a capsule with name \"dltensor\", got \"%s\". " @@ -302,7 +300,7 @@ StatusOr> DLPackManagedTensorToBuffer( "Number of dimensions in DLManagedTensor must be nonnegative, got %d", dlmt->dl_tensor.ndim); } - TF_ASSIGN_OR_RETURN(std::shared_ptr device, + TF_ASSIGN_OR_RETURN(Device * device, DeviceForDLContext(*client, dlmt->dl_tensor.ctx)); absl::Span dimensions( reinterpret_cast(dlmt->dl_tensor.shape), dlmt->dl_tensor.ndim); @@ -329,19 +327,19 @@ StatusOr> DLPackManagedTensorToBuffer( if (dlmt->deleter) { on_delete_callback = [dlmt]() { dlmt->deleter(dlmt); }; } + absl::Span> definition_events; auto device_buffer = std::make_shared( /*allocator=*/nullptr, dlmt->dl_tensor.ctx.device_id, std::initializer_list{buffer}, /*children=*/std::vector>{}, - /*definition_event=*/nullptr, std::move(on_delete_callback)); + definition_events, std::move(on_delete_callback)); // We have taken ownership of the array inside the capsule; make sure the // capsule it cannot be used again. 
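  // Following the DLPack convention, the consumed capsule is renamed to
  // "used_dltensor" so that the name check above rejects a second import, and
  // its destructor is cleared because ownership of the tensor has moved to the
  // returned buffer (which releases it via on_delete_callback).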
PyCapsule_SetName(tensor.ptr(), "used_dltensor"); PyCapsule_SetDestructor(tensor.ptr(), nullptr); - return absl::make_unique(shape, shape, - std::move(device_buffer), - std::move(client), std::move(device)); + return absl::make_unique( + shape, shape, std::move(device_buffer), client, device); } } // namespace xla diff --git a/tensorflow/compiler/xla/python/dlpack.h b/tensorflow/compiler/xla/python/dlpack.h index 92eba687225..09700841ab4 100644 --- a/tensorflow/compiler/xla/python/dlpack.h +++ b/tensorflow/compiler/xla/python/dlpack.h @@ -24,7 +24,7 @@ namespace xla { StatusOr BufferToDLPackManagedTensor(PyLocalBuffer* buffer); StatusOr> DLPackManagedTensorToBuffer( - const pybind11::capsule& tensor, std::shared_ptr client); + const pybind11::capsule& tensor, PyLocalClient* client); } // namespace xla diff --git a/tensorflow/compiler/xla/python/event_pool.h b/tensorflow/compiler/xla/python/event_pool.h index 56787acd87e..f858b5edef8 100644 --- a/tensorflow/compiler/xla/python/event_pool.h +++ b/tensorflow/compiler/xla/python/event_pool.h @@ -68,7 +68,7 @@ class EventPool { const bool allow_reuse_; absl::Mutex mu_; - std::stack> free_events_ GUARDED_BY(mu_); + std::stack> free_events_ TF_GUARDED_BY(mu_); }; } // namespace xla diff --git a/tensorflow/compiler/xla/python/gpu_multistream_test.cc b/tensorflow/compiler/xla/python/gpu_multistream_test.cc new file mode 100644 index 00000000000..a633e4dd020 --- /dev/null +++ b/tensorflow/compiler/xla/python/gpu_multistream_test.cc @@ -0,0 +1,104 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/core/platform/random.h" + +namespace xla { +namespace { + +// Regression test that verifies that substreams of a multistream GPU +// computation wait for the inputs to be produced before executing. 
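// The test below interleaves a large dummy host-to-device transfer with the
// real input transfers and compiles with xla_gpu_use_random_streams enabled,
// so across many iterations the executable's substreams are likely to be
// enqueued while the input transfers are still in flight; without correct
// waits on the input definition events the negated outputs checked at the
// end would not match.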
+TEST(GpuMultiStream, Basics) { + TF_ASSERT_OK_AND_ASSIGN( + std::shared_ptr client, + GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(), + /*distributed_client=*/nullptr, /*node_id=*/0)); + + Device* device = client->local_devices().at(0); + + int n = 1024; + Shape shape = ShapeUtil::MakeShape(S32, {n}); + std::vector inputs(n); + std::vector expected_outputs(n); + + XlaBuilder builder("acomputation"); + auto p0 = Parameter(&builder, 0, shape, "param"); + auto p1 = Parameter(&builder, 1, shape, "param"); + Tuple(&builder, {Neg(p0), Neg(p1)}); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build()); + + CompileOptions compile_options; + compile_options.executable_build_options.mutable_debug_options() + ->set_xla_gpu_disable_multi_streaming(false); + compile_options.executable_build_options.mutable_debug_options() + ->set_xla_gpu_use_random_streams(true); + DeviceAssignment device_assignment(1, 1); + device_assignment(0, 0) = device->id(); + compile_options.executable_build_options.set_device_assignment( + device_assignment); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr executable, + PyLocalExecutable::Compile(computation, client.get(), + std::move(compile_options))); + + int64 dummy_size = 1 << 20; + std::vector dummy_inputs(dummy_size); + Shape dummy_shape = ShapeUtil::MakeShape(S32, {dummy_size}); + + for (int i = 0; i < 100; ++i) { + for (int i = 0; i < n; ++i) { + inputs[i] = tensorflow::random::New64(); + expected_outputs[i] = -inputs[i]; + } + // Transfer a large dummy buffer, behind which the inputs to the computation + // must wait. + TF_ASSERT_OK_AND_ASSIGN( + auto dummy_buffer, + PyLocalBuffer::FromHostBuffer( + dummy_inputs.data(), dummy_shape, /*force_copy=*/false, + /*buffer_reference=*/nullptr, client.get(), device)); + TF_ASSERT_OK_AND_ASSIGN( + auto in_buffer0, + PyLocalBuffer::FromHostBuffer( + inputs.data(), shape, /*force_copy=*/false, + /*buffer_reference=*/nullptr, client.get(), device)); + TF_ASSERT_OK_AND_ASSIGN( + auto in_buffer1, + PyLocalBuffer::FromHostBuffer( + inputs.data(), shape, /*force_copy=*/false, + /*buffer_reference=*/nullptr, client.get(), device)); + // The execution may be enqueued before the transfers complete, requiring + // adequate device-side synchronization. + ExecuteOptions options; + options.untuple_result = true; + TF_ASSERT_OK_AND_ASSIGN( + auto out_buffers, + executable->Execute({in_buffer0.get(), in_buffer1.get()}, options)); + + TF_ASSERT_OK_AND_ASSIGN(auto out_literal, out_buffers[0]->ToLiteral()); + LiteralTestUtil::ExpectR1Equal(expected_outputs, *out_literal); + TF_ASSERT_OK_AND_ASSIGN(out_literal, out_buffers[1]->ToLiteral()); + LiteralTestUtil::ExpectR1Equal(expected_outputs, *out_literal); + } +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index a35b20f6aa1..c721f3bea8b 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -95,6 +95,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" @@ -111,17 +112,55 @@ std::string Device::DebugString() const { return absl::StrCat(platform_name(), ":", id()); } +StatusOr DevicesToDeviceAssignment( + absl::Span> devices) { + if (devices.empty()) { + return InvalidArgument( + "Device assignment passed to Compile() must be non-empty."); + } + if (devices[0].empty()) { + return InvalidArgument( + "Device assignment passed to Compile() must have a nonzero number of " + "partitions per replica; replica 0 had 0 partitions."); + } + DeviceAssignment xla_assignment(devices.size(), devices[0].size()); + for (int replica = 0; replica < devices.size(); ++replica) { + if (devices[replica].size() != devices[0].size()) { + return InvalidArgument( + "Device assignment passed to Compile() has different numbers of " + "partitions between replicas; %d partitions for replica %d versus %d " + "partitions for replica 0.", + devices[replica].size(), replica, devices[0].size()); + } + for (int partition = 0; partition < devices[replica].size(); ++partition) { + if (devices[0][0]->platform_name() != + devices[replica][partition]->platform_name()) { + return InvalidArgument( + "Device assignment passed to Compile() must have devices of a " + "single kind, got %s for replica 0 partition 0 and %s for replica " + "%d partition %d.", + devices[0][0]->platform_name(), + devices[replica][partition]->platform_name(), replica, partition); + } + xla_assignment(replica, partition) = devices[replica][partition]->id(); + } + } + return xla_assignment; +} + PyLocalClient::PyLocalClient( std::string platform_name, LocalClient* client, - std::vector> devices, int host_id, + std::vector> devices, int host_id, std::unique_ptr allocator, - std::unique_ptr host_memory_allocator) + std::unique_ptr host_memory_allocator, + std::unique_ptr gpu_run_options) : platform_name_(std::move(platform_name)), client_(client), devices_(std::move(devices)), host_id_(host_id), owned_allocator_(std::move(allocator)), host_memory_allocator_(std::move(host_memory_allocator)), + gpu_run_options_(std::move(gpu_run_options)), h2d_transfer_pool_(tensorflow::Env::Default(), "py_xla_h2d_transfer", client->device_count()) { if (owned_allocator_ != nullptr) { @@ -130,8 +169,8 @@ PyLocalClient::PyLocalClient( allocator_ = client_->backend().memory_allocator(); } - for (const std::shared_ptr& device : devices_) { - CHECK(id_to_device_.insert({device->id(), device}).second) + for (const std::unique_ptr& device : devices_) { + CHECK(id_to_device_.insert({device->id(), device.get()}).second) << "Duplicate device id: " << device->id(); if (device->local_device_state()) { @@ -140,7 +179,7 @@ PyLocalClient::PyLocalClient( local_devices_.resize(idx + 1); } CHECK(local_devices_[idx] == nullptr) << idx; - local_devices_[idx] = device; + local_devices_[idx] = device.get(); } } for (int idx = 0; idx < local_devices_.size(); ++idx) { @@ -157,8 +196,8 @@ StatusOr PyLocalClient::GetDefaultDeviceAssignment( /* static */ StatusOr> PyLocalBuffer::FromHostBuffer( const void* data, const Shape& shape, bool force_copy, - std::shared_ptr buffer_reference, - std::shared_ptr client, std::shared_ptr device) { + std::shared_ptr buffer_reference, PyLocalClient* client, + Device* device) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromLiterals"); 
VLOG(2) << "PyLocalBuffer::FromLiterals: shape: " << shape.ToString() << " device: " << device->DebugString(); @@ -180,14 +219,14 @@ StatusOr> PyLocalBuffer::FromHostBuffer( }; se::DeviceMemoryBase buffer(const_cast(data), ShapeUtil::ByteSizeOf(shape)); + absl::Span> definition_events; auto device_buffer = std::make_shared( /*allocator=*/nullptr, local_device->device_ordinal(), std::initializer_list{buffer}, /*children=*/std::vector>{}, - /*definition_event=*/nullptr, std::move(on_delete_callback)); + definition_events, std::move(on_delete_callback)); return absl::make_unique( - shape, shape, std::move(device_buffer), std::move(client), - std::move(device)); + shape, shape, std::move(device_buffer), client, device); } TransferManager* transfer_manager = @@ -216,7 +255,7 @@ StatusOr> PyLocalBuffer::FromHostBuffer( std::make_shared(); std::shared_ptr device_buffer = SharedDeviceBuffer::FromScopedShapedBuffer(&scoped_buffer, - definition_event); + {definition_event}); Shape on_device_shape = scoped_buffer.on_device_shape(); auto transfer_h2d = [client, transfer_manager, local_device, device_buffer, @@ -261,7 +300,7 @@ StatusOr> PyLocalBuffer::FromHostBuffer( // Sets the buffer definition event. Note: this has the side effect of // unblocking any host threads that may have been waiting to consume the // buffer. - device_buffer->definition_event()->SetDefinitionEvent( + device_buffer->definition_events()[0]->SetDefinitionEvent( std::move(event), local_device->host_to_device_stream()); if (local_device->synchronous_deallocation()) { @@ -276,12 +315,12 @@ StatusOr> PyLocalBuffer::FromHostBuffer( client->h2d_transfer_pool()->Schedule(transfer_h2d); return absl::make_unique( compact_shape, std::move(on_device_shape), std::move(device_buffer), - std::move(client), std::move(device)); + client, device); } /* static */ StatusOr> PyLocalBuffer::MakeTuple( - const std::vector buffers, - std::shared_ptr client, std::shared_ptr device) { + absl::Span buffers, PyLocalClient* client, + Device* device) { TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, device->GetLocalDeviceState()); std::vector host_shapes; @@ -291,7 +330,7 @@ StatusOr> PyLocalBuffer::FromHostBuffer( device_shapes.reserve(buffers.size()); device_buffers.reserve(buffers.size()); for (const PyLocalBuffer* buffer : buffers) { - if (buffer->device().get() != device.get()) { + if (buffer->device() != device) { return InvalidArgument( "Tuple elements must be on the same device; %s vs %s", buffer->device()->DebugString(), device->DebugString()); @@ -316,7 +355,7 @@ StatusOr> PyLocalBuffer::FromHostBuffer( std::shared_ptr tuple_buffer, SharedDeviceBuffer::MakeTuple( device_buffers, on_host_shape, transfer_manager, allocator, - local_device->device_ordinal(), definition_event)); + local_device->device_ordinal(), {definition_event})); auto buffer = absl::make_unique( std::move(on_host_shape), ShapeUtil::MakeTupleShape(device_shapes), tuple_buffer, std::move(client), std::move(device)); @@ -346,14 +385,84 @@ StatusOr> PyLocalBuffer::FromHostBuffer( return buffer; } +StatusOr>> +MakeCrossHostReceiveBuffersHelper(absl::Span shapes, + PyLocalClient* client, Device* device) { + TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, + device->GetLocalDeviceState()); + TransferManager* transfer_manager = + client->client()->backend().transfer_manager(); + std::vector> buffers; + buffers.reserve(shapes.size()); + se::Stream* host_to_device_stream = local_device->host_to_device_stream(); + for (const auto& shape : shapes) { + TF_ASSIGN_OR_RETURN( + 
ScopedShapedBuffer scoped_buffer, + transfer_manager->AllocateScopedShapedBuffer( + shape, client->allocator(), local_device->device_ordinal())); + + if (!transfer_manager->CanShapedBufferBeAccessedNow( + local_device->compute_stream()->parent(), scoped_buffer)) { + return Unimplemented( + "Cross host receive not enabled unless deallocations are deferred"); + } + + absl::InlinedVector, 2> + definition_events; + + if (scoped_buffer.on_device_shape().IsTuple()) { + TF_CHECK_OK(transfer_manager->WriteTupleIndexTablesAsync( + host_to_device_stream, scoped_buffer)); + definition_events = {std::make_shared(), + std::make_shared()}; + TF_ASSIGN_OR_RETURN(EventPool::Handle event, + local_device->event_pool().ThenAllocateAndRecordEvent( + host_to_device_stream)); + definition_events[1]->SetDefinitionEvent(std::move(event), + host_to_device_stream); + } else { + definition_events = {std::make_shared()}; + } + + std::shared_ptr device_buffer = + SharedDeviceBuffer::FromScopedShapedBuffer(&scoped_buffer, + definition_events); + Shape on_device_shape = scoped_buffer.on_device_shape(); + + auto buffer = absl::make_unique( + shape, std::move(on_device_shape), std::move(device_buffer), client, + device); + + buffers.push_back(std::move(buffer)); + } + return buffers; +} + +/*static*/ void PyLocalBuffer::MakeCrossHostReceiveBuffers( + absl::Span shapes, PyLocalClient* client, Device* device, + PyLocalCrossHostRecvNotifier&& notifier) { + if (shapes.empty()) { + notifier(InvalidArgument( + "shapes parameter empty in MakeCrossHostReceiveBuffers")); + return; + } + auto buffer_or = MakeCrossHostReceiveBuffersHelper(shapes, client, device); + if (!buffer_or.ok()) { + notifier(buffer_or.status()); + return; + } + + client->EnqueueCrossHostReceive(buffer_or.ConsumeValueOrDie(), + std::move(notifier)); +} + PyLocalBuffer::PyLocalBuffer(Shape on_host_shape, Shape on_device_shape, std::shared_ptr device_buffer, - std::shared_ptr client, - std::shared_ptr device) - : client_(std::move(client)), + PyLocalClient* client, Device* device) + : client_(client), on_host_shape_(std::move(on_host_shape)), on_device_shape_(std::move(on_device_shape)), - device_(std::move(device)), + device_(device), device_buffer_(std::move(device_buffer)) {} void PyLocalBuffer::Delete() { @@ -425,7 +534,7 @@ StatusOr PyLocalBuffer::AsShapedBuffer() const { } StatusOr>> -PyLocalBuffer::DestructureTuple() { +PyLocalBuffer::DestructureTuple() const { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::DestructureTuple"); absl::MutexLock lock(&mu_); if (!on_host_shape_.IsTuple()) { @@ -449,13 +558,13 @@ PyLocalBuffer::DestructureTuple() { } StatusOr> PyLocalBuffer::CopyToDevice( - std::shared_ptr dst_device) { + Device* dst_device) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::CopyToDevice"); std::shared_ptr src_device_buffer = DeviceBuffer(); TF_ASSIGN_OR_RETURN(LocalDeviceState * dst_local_device, dst_device->GetLocalDeviceState()); - if (dst_device.get() == device_.get()) { + if (dst_device == device_) { return absl::make_unique( on_host_shape_, on_device_shape_, src_device_buffer, client_, device_); } @@ -488,9 +597,11 @@ StatusOr> PyLocalBuffer::CopyToDevice( TF_RET_CHECK(input_buffer.size() == output_buffer.size()) << "input: " << input_buffer.size() << " output: " << output_buffer.size(); - TF_RETURN_IF_ERROR(transfer_local_device->ThenMemcpyDeviceToDevice( - transfer_stream, dst_local_device->compute_stream(), input_buffer, - output_buffer)); + if (input_buffer.size() != 0) { + 
TF_RETURN_IF_ERROR(transfer_local_device->ThenMemcpyDeviceToDevice( + transfer_stream, dst_local_device->compute_stream(), input_buffer, + output_buffer)); + } } // We hold on to the `src_device_buffer` until the transfer is finished. @@ -517,12 +628,18 @@ StatusOr> PyLocalBuffer::CopyToDevice( definition_event->SetDefinitionEvent(std::move(event), transfer_stream); std::shared_ptr dst_device_buffer = - SharedDeviceBuffer::FromScopedShapedBuffer(&dst_buffer, definition_event); + SharedDeviceBuffer::FromScopedShapedBuffer(&dst_buffer, + {definition_event}); return absl::make_unique( dst_buffer.on_host_shape(), dst_buffer.on_device_shape(), std::move(dst_device_buffer), client_, dst_device); } +Status PyLocalBuffer::CopyToRemoteDevice( + absl::string_view serialized_descriptor, Device* dst_device) { + return client_->CopyToRemoteDevice(this, serialized_descriptor, dst_device); +} + Status PyLocalBuffer::BlockHostUntilReady() { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::BlockHostUntilReady"); std::shared_ptr device_buffer = DeviceBuffer(); @@ -541,8 +658,7 @@ Status PyLocalBuffer::BlockHostUntilReady() { return stream->BlockHostUntilDone(); } -static std::shared_ptr LookupDevice(const PyLocalClient& client, - int device_id) { +static Device* LookupDevice(const PyLocalClient& client, int device_id) { auto it = client.id_to_device().find(device_id); CHECK(it != client.id_to_device().end()) << "Unknown device id: " << device_id; @@ -551,8 +667,8 @@ static std::shared_ptr LookupDevice(const PyLocalClient& client, PyLocalExecutable::PyLocalExecutable( std::vector> executables, - DeviceAssignment device_assignment, std::shared_ptr client) - : client_(std::move(client)), + DeviceAssignment device_assignment, PyLocalClient* client) + : client_(client), device_assignment_( std::make_shared(device_assignment)) { executables_.reserve(executables.size()); @@ -577,7 +693,7 @@ PyLocalExecutable::PyLocalExecutable( for (int replica = 0; replica < num_replicas; ++replica) { for (int partition = 0; partition < num_partitions; ++partition) { int device_id = (*device_assignment_)(replica, partition); - std::shared_ptr device = LookupDevice(*client_, device_id); + Device* device = LookupDevice(*client_, device_id); if (device->host_id() != client_->host_id()) { VLOG(3) << "Non-local device: " << device_id; continue; @@ -602,14 +718,27 @@ const std::string& PyLocalExecutable::name() const { } } -StatusOr> PyLocalExecutable::ExecuteHelper( +StatusOr>> +PyLocalExecutable::ExecuteHelper( absl::Span argument_handles, int replica, - int partition, const RunId& run_id) { + int partition, const RunId& run_id, const ExecuteOptions& options) const { const int device_id = (*device_assignment_)(replica, partition); - std::shared_ptr device = LookupDevice(*client_, device_id); + Device* device = LookupDevice(*client_, device_id); + + std::unique_ptr tuple_buffer; + std::vector tupled_arguments; + if (options.tuple_arguments) { + TF_ASSIGN_OR_RETURN(tuple_buffer, PyLocalBuffer::MakeTuple( + argument_handles, client_, device)); + tupled_arguments = {tuple_buffer.get()}; + argument_handles = tupled_arguments; + } CHECK_EQ(device->host_id(), client_->host_id()); int device_ordinal = device->local_device_state()->device_ordinal(); - tensorflow::profiler::TraceMe traceme("LocalExecutable::Execute"); + tensorflow::profiler::TraceMe traceme([&] { + return absl::StrCat("LocalExecutable::Execute#run_id=", run_id.ToInt(), + "#"); + }); VLOG(3) << "Replica " << replica << ", partition " << partition << " mapped to device 
ordinal for execution: " << device_ordinal; @@ -628,7 +757,7 @@ StatusOr> PyLocalExecutable::ExecuteHelper( "Deleted buffer passed to Execute() as argument %d to replica %d", i, replica); } - if (handle->device().get() != device.get()) { + if (handle->device() != device) { return InvalidArgument( "Buffer passed to Execute() as argument %d to replica %d is on " "device %s, but replica is assigned to device %s.", @@ -649,15 +778,16 @@ StatusOr> PyLocalExecutable::ExecuteHelper( event->WaitForEventOnStream(device_state->compute_stream()); } - ExecutableRunOptions options; - options.set_stream(device_state->compute_stream()); - options.set_host_to_device_stream(device_state->host_to_device_stream()); - options.set_allocator(client_->allocator()); - options.set_intra_op_thread_pool( + ExecutableRunOptions run_options; + run_options.set_stream(device_state->compute_stream()); + run_options.set_host_to_device_stream(device_state->host_to_device_stream()); + run_options.set_allocator(client_->allocator()); + run_options.set_intra_op_thread_pool( client_->client()->backend().eigen_intra_op_thread_pool_device()); - options.set_device_assignment(device_assignment_.get()); - options.set_run_id(run_id); - options.set_rng_seed(device_state->GetNewPrngSeed()); + run_options.set_device_assignment(device_assignment_.get()); + run_options.set_run_id(run_id); + run_options.set_rng_seed(device_state->GetNewPrngSeed()); + run_options.set_gpu_executable_run_options(client_->gpu_run_options()); // The choice of where we wait is arbitrary; the reason for the wait is pacing // to avoid problems such as memory fragmentation and running ahead too far, @@ -670,7 +800,7 @@ StatusOr> PyLocalExecutable::ExecuteHelper( int executable_idx = executables_.size() > 1 ? partition : 0; StatusOr result_buffer_or_status = - executables_[executable_idx]->RunAsync(argument_buffer_ptrs, options); + executables_[executable_idx]->RunAsync(argument_buffer_ptrs, run_options); VLOG(1) << "Replica " << replica << " partition " << partition << " completed; ok=" << result_buffer_or_status.ok(); @@ -690,7 +820,7 @@ StatusOr> PyLocalExecutable::ExecuteHelper( std::shared_ptr out_buffer = SharedDeviceBuffer::FromScopedShapedBuffer(&result_buffer, - definition_event); + {definition_event}); if (device_state->synchronous_deallocation()) { device_buffers.push_back(out_buffer); @@ -702,13 +832,19 @@ StatusOr> PyLocalExecutable::ExecuteHelper( device_state->compute_stream(), std::make_tuple(executables_[executable_idx], compute_reservation, device_assignment_)); - return absl::make_unique( + std::vector> outputs; + outputs.push_back(absl::make_unique( result_buffer.on_host_shape(), result_buffer.on_device_shape(), - std::move(out_buffer), client_, device); + std::move(out_buffer), client_, device)); + if (options.untuple_result && result_buffer.on_host_shape().IsTuple()) { + TF_ASSIGN_OR_RETURN(outputs, outputs.front()->DestructureTuple()); + } + return outputs; } -StatusOr> PyLocalExecutable::Execute( - absl::Span argument_handles) { +StatusOr>> +PyLocalExecutable::Execute(absl::Span argument_handles, + const ExecuteOptions& options) const { if (num_replicas() != 1) { return InvalidArgument( "Attempted to execute computation with %d replicas using Execute()", @@ -721,27 +857,18 @@ StatusOr> PyLocalExecutable::Execute( } VLOG(1) << "Executing computation " << name(); return ExecuteHelper(argument_handles, /*replica=*/0, /*partition=*/0, - RunId()); + RunId(), options); } -StatusOr>> -PyLocalExecutable::ExecutePerReplica( - absl::Span> 
argument_handles) { - tensorflow::profiler::TraceMe traceme("LocalExecutable::ExecutePerReplica"); - if (num_partitions() != 1) { - return InvalidArgument( - "Attempted to execute computation with %d partitions using " - "ExecutePerReplica()", - num_partitions()); - } - return ExecuteOnLocalDevices(argument_handles); -} - -StatusOr>> +StatusOr>>> PyLocalExecutable::ExecuteOnLocalDevices( - absl::Span> argument_handles) { - tensorflow::profiler::TraceMe traceme( - "LocalExecutable::ExecuteOnLocalDevices"); + absl::Span> argument_handles, + const ExecuteOptions& options) const { + RunId run_id; + tensorflow::profiler::TraceMe traceme([&] { + return absl::StrCat( + "LocalExecutable::ExecuteOnLocalDevices#run_id=", run_id.ToInt(), "#"); + }); const int num_local_devices = local_devices_.size(); @@ -755,9 +882,9 @@ PyLocalExecutable::ExecuteOnLocalDevices( VLOG(1) << "Executing computation " << name() << "; num_replicas=" << num_replicas() - << " num_partitions=" << num_partitions() - << " num_local_devices=" << num_local_devices; + << " num_partitions=" << num_partitions() << " num_local_devices=" + << num_local_devices; std::vector>>> results( num_local_devices); if (num_local_devices == 1) { // Fast-path if there is only one device — run the computation on the - const int replica = local_logical_device_ids_[0].first; const int partition = local_logical_device_ids_[0].second; results[0] = - ExecuteHelper(argument_handles[0], replica, partition, RunId()); + ExecuteHelper(argument_handles[0], replica, partition, run_id, options); } else { - RunId run_id; absl::Mutex mu; int running = num_local_devices; int failed = 0; @@ -776,11 +902,11 @@ PyLocalExecutable::ExecuteOnLocalDevices( for (int i = 0; i < num_local_devices; ++i) { const int replica = local_logical_device_ids_[i].first; const int partition = local_logical_device_ids_[i].second; - std::shared_ptr device = local_devices_[i]; + Device* device = local_devices_[i]; const LocalDeviceState& device_state = *device->local_device_state(); device_state.execute_thread()->Schedule([&, replica, partition, i] { - results[i] = - ExecuteHelper(argument_handles[i], replica, partition, run_id); + results[i] = ExecuteHelper(argument_handles[i], replica, partition, + run_id, options); absl::MutexLock lock(&mu); --running; @@ -821,7 +947,7 @@ PyLocalExecutable::ExecuteOnLocalDevices( } VLOG(1) << "Replicated execution complete."; - std::vector> wrapped_results( + std::vector>> wrapped_results( num_local_devices); for (int i = 0; i < num_local_devices; ++i) { const int replica = local_logical_device_ids_[i].first; @@ -840,107 +966,37 @@ PyLocalExecutable::ExecuteOnLocalDevices( return wrapped_results; } -/*static*/ StatusOr> -PyLocalExecutable::CompileForDevices( - const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - const std::vector>>& - device_assignment) { - if (device_assignment.empty()) { - return InvalidArgument( - "Device assignment passed to Compile() must be non-empty."); - } - if (device_assignment[0].empty()) { - return InvalidArgument( - "Device assignment passed to Compile() must have a nonzero number of " - "partitions per replica; replica 0 had 0 partitions."); - } - DeviceAssignment xla_assignment(device_assignment.size(), - device_assignment[0].size()); - for (int replica = 0; replica < device_assignment.size(); ++replica) { - if 
(device_assignment[replica].size() != device_assignment[0].size()) { - return InvalidArgument( - "Device assignment passed to Compile() has different numbers of " - "partitions between replicas; %d partitions for replica %d versus %d " - "partitions for replica 0.", - device_assignment[replica].size(), replica, - device_assignment[0].size()); - } - for (int partition = 0; partition < device_assignment[replica].size(); - ++partition) { - if (device_assignment[0][0]->platform_name() != - device_assignment[replica][partition]->platform_name()) { - return InvalidArgument( - "Device assignment passed to Compile() must have devices of a " - "single kind, got %s for replica 0 partition 0 and %s for replica " - "%d partition %d.", - device_assignment[0][0]->platform_name(), - device_assignment[replica][partition]->platform_name(), replica, - partition); - } - xla_assignment(replica, partition) = - device_assignment[replica][partition]->id(); - } - } - return Compile(computation, std::move(argument_layouts), build_options, - std::move(client), xla_assignment); -} - /*static*/ StatusOr> PyLocalExecutable::Compile(const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional device_assignment) { + PyLocalClient* client, CompileOptions options) { tensorflow::profiler::TraceMe traceme("LocalExecutable::Compile"); - ExecutableBuildOptions options; - if (build_options != nullptr) { - options = *build_options; + ExecutableBuildOptions& build_options = options.executable_build_options; + if (!build_options.device_allocator()) { + build_options.set_device_allocator(client->allocator()); } - if (!options.device_allocator()) { - options.set_device_allocator(client->allocator()); + if (!build_options.has_device_assignment()) { + VLOG(2) << "PyLocalExecutable::Compile using default device_assignment."; + TF_ASSIGN_OR_RETURN( + DeviceAssignment device_assignment, + client->GetDefaultDeviceAssignment(build_options.num_replicas(), + build_options.num_partitions())); + build_options.set_device_assignment(device_assignment); } + VLOG(2) << "PyLocalExecutable::Compile device_assignment:\n" + << build_options.device_assignment().ToString(); - if (device_assignment) { - VLOG(2) << "PyLocalExecutable::Compile got device_assignment:\n" - << device_assignment->ToString(); - if (device_assignment->replica_count() != options.num_replicas()) { - return InvalidArgument( - "Mismatched number of replicas for device " - "assignment and computation (%d vs %d).\n%s", - device_assignment->replica_count(), options.num_replicas(), - device_assignment->ToString()); - } - if (device_assignment->computation_count() != options.num_partitions()) { - return InvalidArgument( - "Mismatched number of partitions for device " - "assignment and computation (%d vs %d).\n%s", - device_assignment->computation_count(), options.num_partitions(), - device_assignment->ToString()); - } - } else { - TF_ASSIGN_OR_RETURN(device_assignment, - client->GetDefaultDeviceAssignment( - options.num_replicas(), options.num_partitions())); - VLOG(2) << "PyLocalExecutable::Compile using default device_assignment:\n" - << device_assignment->ToString(); - } - options.set_device_assignment(device_assignment.value()); - - if (!argument_layouts) { + if (!options.argument_layouts) { TF_ASSIGN_OR_RETURN(ProgramShape program_shape, computation.GetProgramShape()); - argument_layouts = program_shape.parameters(); - for (Shape& shape : *argument_layouts) { + 
options.argument_layouts = program_shape.parameters(); + for (Shape& shape : *options.argument_layouts) { LayoutUtil::ClearLayout(&shape); } } std::vector argument_layout_pointers; - argument_layout_pointers.reserve(argument_layouts->size()); + argument_layout_pointers.reserve(options.argument_layouts->size()); // Assign a default layout to any array subshapes that are missing layouts. auto assign_layouts = [client](Shape* shape) { @@ -958,14 +1014,14 @@ PyLocalExecutable::Compile(const XlaComputation& computation, }); }; - for (Shape& layout : *argument_layouts) { + for (Shape& layout : *options.argument_layouts) { argument_layout_pointers.push_back(&layout); TF_RETURN_IF_ERROR(assign_layouts(&layout)); } Shape result_layout; - if (options.result_layout()) { - result_layout = *options.result_layout(); + if (build_options.result_layout()) { + result_layout = *build_options.result_layout(); } else { TF_ASSIGN_OR_RETURN(ProgramShape program_shape, computation.GetProgramShape()); @@ -973,16 +1029,15 @@ PyLocalExecutable::Compile(const XlaComputation& computation, LayoutUtil::ClearLayout(&result_layout); } TF_RETURN_IF_ERROR(assign_layouts(&result_layout)); - options.set_result_layout(result_layout); + build_options.set_result_layout(result_layout); TF_ASSIGN_OR_RETURN( std::vector> local_executables, client->client()->Compile(computation, argument_layout_pointers, - options)); + build_options)); - return absl::make_unique(std::move(local_executables), - std::move(*device_assignment), - std::move(client)); + return absl::make_unique( + std::move(local_executables), build_options.device_assignment(), client); } } // namespace xla diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 51c8df90786..401064af77c 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -30,13 +30,20 @@ limitations under the License. #include "tensorflow/compiler/xla/python/local_device_state.h" #include "tensorflow/compiler/xla/python/shared_device_buffer.h" #include "tensorflow/compiler/xla/service/computation_placer.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/lib/core/status.h" +// API notes: +// Despite having the name "PyLocalClient", it is intended that this API may +// also be consumed from C++. Python/pybind11/NumPy logic should therefore not +// be used in this API. + namespace xla { class Device { @@ -80,15 +87,32 @@ class Device { const std::string platform_name_; }; +class PyLocalBuffer; +// Helper struct for cross host transfers, returned by the callback from a call +// to PyLocalBuffer::MakeCrossHostReceiveBuffers. +struct PyLocalCrossHostRecvBuffer { + // serialized_descriptor should be transmitted to the sender and passed to a + // call to src_buffer->CopyToRemoteDevice. + std::string serialized_descriptor; + // The buffer that will hold the result of the transfer. + std::unique_ptr buffer; +}; +using PyLocalCrossHostRecvNotifier = + std::function>&&)>; + // Encapsulates the state of Python session with XLA. 
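// A rough end-to-end sketch of the intended C++ usage, mirroring
// gpu_multistream_test.cc (error handling elided; the GPU client factory,
// computation, and buffer names are illustrative stand-ins):
//
//   TF_ASSERT_OK_AND_ASSIGN(
//       std::shared_ptr<PyLocalClient> client,
//       GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(),
//                          /*distributed_client=*/nullptr, /*node_id=*/0));
//   TF_ASSERT_OK_AND_ASSIGN(
//       std::unique_ptr<PyLocalExecutable> executable,
//       PyLocalExecutable::Compile(computation, client.get(),
//                                  CompileOptions()));
//   TF_ASSERT_OK_AND_ASSIGN(
//       auto outputs,
//       executable->Execute({input_buffer.get()}, ExecuteOptions()));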
-class PyLocalClient { +// +// It is the responsibility of the client of this API to keep the PyLocalClient +// alive as long as any of the other runtime objects are alive. +class PyLocalClient : public std::enable_shared_from_this { public: // `allocator` may null, in which case the platform default allocator is used. explicit PyLocalClient( std::string platform_name, LocalClient* client, - std::vector> devices, int host_id, + std::vector> devices, int host_id, std::unique_ptr allocator, - std::unique_ptr host_memory_allocator); + std::unique_ptr host_memory_allocator, + std::unique_ptr gpu_run_options); virtual ~PyLocalClient() = default; virtual StatusOr GetDefaultDeviceAssignment( @@ -96,15 +120,11 @@ class PyLocalClient { int device_count() const { return devices_.size(); } int local_device_count() const { return local_devices_.size(); } - const std::vector>& devices() const { + const std::vector>& devices() const { return devices_; } - const std::vector>& local_devices() const { - return local_devices_; - } - const std::map>& id_to_device() const { - return id_to_device_; - } + const std::vector& local_devices() const { return local_devices_; } + const std::map& id_to_device() const { return id_to_device_; } int host_id() const { return host_id_; } const std::string& platform_name() const { return platform_name_; } @@ -118,6 +138,10 @@ class PyLocalClient { return host_memory_allocator_.get(); } + GpuExecutableRunOptions* gpu_run_options() const { + return gpu_run_options_.get(); + } + tensorflow::thread::ThreadPool* h2d_transfer_pool() { return &h2d_transfer_pool_; } @@ -128,15 +152,28 @@ class PyLocalClient { virtual bool EnqueueD2DTransfersOnSrcStream() const { return true; } protected: + friend class PyLocalBuffer; + virtual void EnqueueCrossHostReceive( + std::vector>&& buffers, + PyLocalCrossHostRecvNotifier&& notifier) const { + notifier(Unimplemented("Cross host receives not implemented.")); + } + + virtual Status CopyToRemoteDevice(PyLocalBuffer* buffer, + absl::string_view serialized_descriptor, + Device* device) const { + return Unimplemented("Cross host sends not implemented."); + } + std::string platform_name_; LocalClient* client_; // Includes all devices, including non-local devices on multi-host platforms. - std::vector> devices_; + std::vector> devices_; // Maps Device::id() to the corresponding Device. Includes all devices. - std::map> id_to_device_; + std::map id_to_device_; // Local devices indexed by local device ordinal. - std::vector> local_devices_; + std::vector local_devices_; int host_id_; se::DeviceMemoryAllocator* allocator_; @@ -147,9 +184,16 @@ class PyLocalClient { // device via a staging area of pinned memory. std::unique_ptr host_memory_allocator_; + std::unique_ptr gpu_run_options_; + tensorflow::thread::ThreadPool h2d_transfer_pool_; }; +// Converts a 2D set of Device objects indexed by [replica][partition] into an +// xla::DeviceAssignment. +StatusOr DevicesToDeviceAssignment( + absl::Span> devices); + // Holds a reference from Python to one or more device buffers. // A PyLocalBuffer can be either valid or invalid. An invalid buffer is one that // has never been initialized, or a buffer that has been deleted (e.g., by @@ -166,17 +210,29 @@ class PyLocalBuffer { // the runtime (may be nullptr). 
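  // Example (a sketch following the calls in gpu_multistream_test.cc;
  // host_data is an illustrative name):
  //   auto buffer = PyLocalBuffer::FromHostBuffer(
  //                     host_data.data(), shape, /*force_copy=*/false,
  //                     /*buffer_reference=*/nullptr, client, device)
  //                     .ValueOrDie();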
static StatusOr> FromHostBuffer( const void* data, const Shape& shape, bool force_copy, - std::shared_ptr buffer_reference, - std::shared_ptr client, std::shared_ptr device); + std::shared_ptr buffer_reference, PyLocalClient* client, + Device* device); static StatusOr> MakeTuple( - const std::vector buffers, - std::shared_ptr client, std::shared_ptr device); + absl::Span buffers, PyLocalClient* client, + Device* device); + + // Asynchronously makes a vector of PyLocalBuffers that can be used to receive + // cross host transfers using `client` on `device'. `shapes` must be the exact + // shapes, with identical layouts, corresponding to the buffers that will be + // sent. When resources for the transfer are available, notifier will be + // called with a vector of PyLocalCrossHostRecvBuffer structs, one for each + // shape in `shapes`. Each struct contains a buffer that will contain the + // received value, and an opaque string that should be transmitted to the + // sending host and used in a call to CopyToRemoteDevice. None of the recv + // buffers will become ready until *all* of the sends have completed. + static void MakeCrossHostReceiveBuffers( + absl::Span shapes, PyLocalClient* client, Device* device, + PyLocalCrossHostRecvNotifier&& notifier); PyLocalBuffer(Shape on_host_shape, Shape on_device_shape, std::shared_ptr device_buffer, - std::shared_ptr client, - std::shared_ptr device); + PyLocalClient* client, Device* device); PyLocalBuffer(const PyLocalBuffer&) = delete; PyLocalBuffer(PyLocalBuffer&&) = delete; @@ -185,9 +241,9 @@ class PyLocalBuffer { const Shape& on_host_shape() const { return on_host_shape_; } const Shape& on_device_shape() const { return on_device_shape_; } - std::shared_ptr device() const { return device_; } + Device* device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } - std::shared_ptr client() const { return client_; } + PyLocalClient* client() const { return client_; } // Returns the buffer's value as a tuple DAG of Python arrays. If the value // has previously been prefetched to the host, then returns the prefetched @@ -213,23 +269,35 @@ class PyLocalBuffer { StatusOr AsShapedBuffer() const; // Destructures a tuple-valued PyLocalBuffer into its constituent elements. - StatusOr>> DestructureTuple(); + StatusOr>> DestructureTuple() + const; // Copies the buffer to device `dst_device`. - StatusOr> CopyToDevice( - std::shared_ptr dst_device); + StatusOr> CopyToDevice(Device* dst_device); + + // Copies the buffer to remote device `dst_device`. This call must be preceded + // by a call to MakeCrossHostReceiveBuffers on the remote host's + // dst_device. MakeCrossHostReceiveBuffers takes an array of shapes to + // construct the destination buffers, and a callback supplies an array + // containing both the destination buffers, and a serialized descriptor for + // each buffer. For each destination buffer there should be a matching call to + // src->CopyToRemoteDevice on a remote host for a src buffer of the + // corresponding shape. serialized_descriptor is the string returned by the + // callback along with the corresponding destination buffer. + Status CopyToRemoteDevice(absl::string_view serialized_descriptor, + Device* dst_device); // Blocks the host until the buffer's value has been computed and is ready for // immediate use on the device. Useful in particular for timing benchmarks. 
Status BlockHostUntilReady(); private: - const std::shared_ptr client_; + PyLocalClient* const client_; const Shape on_host_shape_; const Shape on_device_shape_; - const std::shared_ptr device_; + Device* const device_; mutable absl::Mutex mu_; - std::shared_ptr device_buffer_ GUARDED_BY(mu_); + std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); // The cached value of the buffer on the host, produced either from a call to // CopyToHost or from a call to ToLiteral. Once a value has been fetched to @@ -241,7 +309,25 @@ class PyLocalBuffer { Status status; std::shared_ptr value; }; - std::shared_ptr host_value_ GUARDED_BY(mu_); + std::shared_ptr host_value_ TF_GUARDED_BY(mu_); +}; + +struct CompileOptions { + // The layouts of the arguments that the computation should expect. + absl::optional> argument_layouts; + + // XLA's compilation time options. + ExecutableBuildOptions executable_build_options; +}; + +struct ExecuteOptions { + // If true, the arguments to the computation will be wrapped in a tuple and + // passed as a single parameter. + bool tuple_arguments = false; + + // If true, the computation must return a tuple, which will be destructured + // into its elements. + bool untuple_result = false; }; // Represents a compiled computation that can be executed given handles to @@ -249,27 +335,14 @@ class PyLocalBuffer { // partition, as specified by the build options). class PyLocalExecutable { public: - // Compiles a computation to an executable. - static StatusOr> CompileForDevices( - const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - const std::vector>>& - device_assignment); - - // TODO(phawkins): Deprecated. Delete once all callers have been updated to - // use the newer form. static StatusOr> Compile( - const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - absl::optional device_assignment); + const XlaComputation& computation, PyLocalClient* client, + CompileOptions options); PyLocalExecutable(std::vector> executables, - DeviceAssignment device_assignment, - std::shared_ptr client); + DeviceAssignment device_assignment, PyLocalClient* client); + + PyLocalClient* client() const { return client_; } int num_replicas() const { return executables_[0]->build_options().num_replicas(); @@ -299,41 +372,34 @@ class PyLocalExecutable { return local_logical_device_ids_; } - const std::vector>& local_devices() const { - return local_devices_; - } + const std::vector& local_devices() const { return local_devices_; } - StatusOr> Execute( - absl::Span argument_handles); - - // Execute on many replicas. Takes a sequence of argument lists (one argument - // list per replica) and returns a tuple of results (one result per replica). - // The number of argument lists must be equal to the replica count. - // The executable must have only one partition. - // TODO(cjfj): Remove this once JAX is moved to `ExecuteOnLocalDevices`. - StatusOr>> ExecutePerReplica( - absl::Span> argument_handles); + StatusOr>> Execute( + absl::Span argument_handles, + const ExecuteOptions& options) const; // Execute on local devices. Takes a sequence of argument lists (one argument // list per local device) and returns a tuple of results (one result per local // device). The number of argument lists must be equal to the local device // count. 
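  // Call-pattern sketch (argument names are illustrative only):
  //   ExecuteOptions opts;
  //   opts.untuple_result = true;  // split a tuple result into its elements
  //   auto per_device =
  //       executable->ExecuteOnLocalDevices(args, opts).ValueOrDie();
  //   // per_device[i] holds the output buffers for the i-th local device,
  //   // ordered consistently with local_devices().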
- StatusOr>> ExecuteOnLocalDevices( - absl::Span> argument_handles); + StatusOr>>> + ExecuteOnLocalDevices( + absl::Span> argument_handles, + const ExecuteOptions& options) const; void Delete() { executables_.clear(); } const string& name() const; private: - StatusOr> ExecuteHelper( + StatusOr>> ExecuteHelper( absl::Span argument_handles, int replica, - int partition, const RunId& run_id); + int partition, const RunId& run_id, const ExecuteOptions& options) const; // Create shared pointers so we can free them after the execution: with // asynchronous execution, the process being executed can outlive the // executable itself. - std::shared_ptr const client_; + PyLocalClient* const client_; // One executable per partition. std::vector> executables_; std::shared_ptr device_assignment_; @@ -350,7 +416,7 @@ class PyLocalExecutable { // assigned. // shared_ptrs instead of unique_ptrs to play well with the Python bindings // (see xla.cc). - std::vector> local_devices_; + std::vector local_devices_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/python/local_device_state.h b/tensorflow/compiler/xla/python/local_device_state.h index a64176294e0..fa73c832c57 100644 --- a/tensorflow/compiler/xla/python/local_device_state.h +++ b/tensorflow/compiler/xla/python/local_device_state.h @@ -129,12 +129,12 @@ class LocalDeviceState { static constexpr int kNumDeviceToDeviceStreams = 4; absl::Mutex mu_; - int next_device_to_host_stream_ GUARDED_BY(mu_) = 0; - int next_device_to_device_stream_ GUARDED_BY(mu_) = 0; + int next_device_to_host_stream_ TF_GUARDED_BY(mu_) = 0; + int next_device_to_device_stream_ TF_GUARDED_BY(mu_) = 0; - std::random_device prng_seed_device_ GUARDED_BY(mu_); - std::mt19937 prng_seed_generator_ GUARDED_BY(mu_); - std::uniform_int_distribution<> prng_seed_distribution_ GUARDED_BY(mu_); + std::random_device prng_seed_device_ TF_GUARDED_BY(mu_); + std::mt19937 prng_seed_generator_ TF_GUARDED_BY(mu_); + std::uniform_int_distribution<> prng_seed_distribution_ TF_GUARDED_BY(mu_); // Callback stream is used for running short host-side callbacks after device // side events, without preventing the device-side stream from doing useful diff --git a/tensorflow/compiler/xla/python/nvidia_gpu_device.cc b/tensorflow/compiler/xla/python/nvidia_gpu_device.cc index b7b2faef8d7..26ea727dee7 100644 --- a/tensorflow/compiler/xla/python/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/python/nvidia_gpu_device.cc @@ -15,28 +15,82 @@ limitations under the License. #include "tensorflow/compiler/xla/python/nvidia_gpu_device.h" +#ifdef NCCL_ENABLED +#include "third_party/nccl/nccl.h" +#endif // NCCL_ENABLED #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/platform_util.h" -#include "tensorflow/core/common_runtime/bfc_allocator.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_mem_allocator.h" #include "tensorflow/stream_executor/tf_allocator_adapter.h" namespace xla { +namespace { static const char kGpuPlatformName[] = "gpu"; -GpuDevice::GpuDevice(int id, - std::unique_ptr local_device_state) - : Device(id, std::move(local_device_state), kGpuPlatformName) {} +// A custom PyLocalClient that overrides the device assignment method. 
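// When the requested replica count fits on this host's local devices, the
// override below maps replica i to the i-th local device with a single
// partition; otherwise it falls back to PyLocalClient's default assignment.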
+class GpuClient : public xla::PyLocalClient { + public: + using xla::PyLocalClient::PyLocalClient; -static StatusOr> CreateBFCAllocator( - se::Platform* platform, - absl::Span> local_devices, - LocalClient* client, double memory_fraction, bool preallocate) { - CHECK_GT(client->backend().device_count(), 0); + xla::StatusOr GetDefaultDeviceAssignment( + int num_replicas, int num_partitions) const override; +}; + +xla::StatusOr GpuClient::GetDefaultDeviceAssignment( + int num_replicas, int num_partitions) const { + // XLA:GPU does not support multiple partitions yet. + TF_RET_CHECK(num_partitions == 1) << num_partitions; + if (num_replicas <= local_devices().size()) { + xla::DeviceAssignment assignment(num_replicas, 1); + for (int i = 0; i < num_replicas; ++i) { + assignment(i, 0) = local_devices().at(i)->id(); + } + return assignment; + } + // Fallback to default global device assignment if we can't run locally. + return PyLocalClient::GetDefaultDeviceAssignment(num_replicas, + num_partitions); +} + +// Builds an xla::LocalClient for the GPU platform. +StatusOr GetGpuXlaClient() { + TF_ASSIGN_OR_RETURN(se::Platform * platform, + PlatformUtil::GetPlatform("CUDA")); + if (platform->VisibleDeviceCount() <= 0) { + return FailedPrecondition("No visible NVidia GPU devices."); + } + LocalClientOptions options; + options.set_platform(platform); + return ClientLibrary::GetOrCreateLocalClient(options); +} + +// Builds a LocalDeviceState for each GPU present. +StatusOr>> BuildLocalDeviceStates( + LocalClient* xla_client, bool asynchronous) { + std::vector> local_devices; + for (int i = 0; i < xla_client->device_count(); ++i) { + se::StreamExecutor* executor = + xla_client->backend().stream_executor(i).ValueOrDie(); + local_devices.push_back(absl::make_unique( + executor, xla_client, /*synchronous_deallocation=*/false, asynchronous, + /*allow_event_reuse=*/true)); + } + return std::move(local_devices); +} + +// Builds a BFCAllocator for all local GPUs. 
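// One device-memory sub-allocator is created per local StreamExecutor and
// wrapped in a BFC arena named "GPU_<ordinal>_bfc"; growth is allowed unless
// `preallocate` is requested, and the per-device arenas are combined into a
// single multi-device adapter (tf_allocator_adapter.h) keyed by each device's
// compute stream.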
+StatusOr> CreateBFCAllocator( + absl::Span const> local_devices, + double memory_fraction, bool preallocate) { + CHECK_GT(local_devices.size(), 0); + const se::Platform* platform = local_devices.front()->executor()->platform(); std::vector allocators; - for (se::StreamExecutor* executor : client->backend().stream_executors()) { + for (auto& local_device : local_devices) { + se::StreamExecutor* executor = local_device->executor(); int device_ordinal = executor->device_ordinal(); auto sub_allocator = absl::make_unique( executor, tensorflow::PlatformGpuId(device_ordinal), @@ -65,60 +119,201 @@ static StatusOr> CreateBFCAllocator( /*allow_growth=*/!preallocate, absl::StrCat("GPU_", device_ordinal, "_bfc")); allocators.emplace_back(std::move(gpu_bfc_allocator), - local_devices.at(device_ordinal) - ->local_device_state() - ->compute_stream()); + local_device->compute_stream()); } return absl::make_unique(platform, std::move(allocators)); } -StatusOr> GetNvidiaGpuClient( - bool asynchronous, const GpuAllocatorConfig& allocator_config) { - TF_ASSIGN_OR_RETURN(se::Platform * platform, - PlatformUtil::GetPlatform("CUDA")); - if (platform->VisibleDeviceCount() <= 0) { - return FailedPrecondition("No visible NVidia GPU devices."); - } - LocalClientOptions options; - options.set_platform(platform); - TF_ASSIGN_OR_RETURN(LocalClient * client, - ClientLibrary::GetOrCreateLocalClient(options)); - - std::vector> devices; - for (int i = 0; i < client->device_count(); ++i) { - se::StreamExecutor* executor = - client->backend().stream_executor(i).ValueOrDie(); - auto device_state = absl::make_unique( - executor, client, /*synchronous_deallocation=*/false, asynchronous, - /*allow_event_reuse=*/true); - std::shared_ptr device = - std::make_shared(i, std::move(device_state)); - devices.push_back(std::move(device)); - } - +// Constructs a GPU device memory allocator to use, according to the allocator +// configuration the client requested. +StatusOr> GetGpuDeviceAllocator( + const GpuAllocatorConfig& allocator_config, + absl::Span const> local_devices) { std::unique_ptr allocator; - std::unique_ptr host_memory_allocator; if (allocator_config.kind != GpuAllocatorConfig::Kind::kPlatform) { - TF_ASSIGN_OR_RETURN(allocator, - CreateBFCAllocator(platform, devices, client, - allocator_config.memory_fraction, - allocator_config.preallocate)); + TF_ASSIGN_OR_RETURN( + allocator, + CreateBFCAllocator(local_devices, allocator_config.memory_fraction, + allocator_config.preallocate)); } + return std::move(allocator); +} +// Returns a GPU pinned host memory allocator to use when staging host->GPU +// transfers. We use a fixed 64MB pool of pinned memory. +std::unique_ptr GetGpuHostAllocator( + se::StreamExecutor* executor) { tensorflow::SubAllocator* sub_allocator = new tensorflow::GpuHostAllocator( - client->backend().stream_executor(0).ValueOrDie(), /*numa_node=*/0, - /*alloc_visitors=*/{}, - /*free_visitors=*/{}); + executor, /*numa_node=*/0, /*alloc_visitors=*/{}, /*free_visitors=*/{}); // TODO(phawkins): allow the user to tune this. const int64 kGpuHostMemoryLimitBytes = 64 * (1LL << 30); - host_memory_allocator = absl::make_unique( + return absl::make_unique( sub_allocator, kGpuHostMemoryLimitBytes, /*allow_growth=*/true, /*name=*/"xla_gpu_host_bfc"); +} - return std::make_shared("gpu", client, std::move(devices), - /*host_id=*/0, std::move(allocator), - std::move(host_memory_allocator)); +// A table mapping NcclCliqueKeys to ncclUniqueId values encoded as strings. 
+// In a distributed setup the table of NCCL IDs is kept on the master node +// (node 0). Currently node 0 is the only node that generates ncclUniqueIds; +// see the TODO below. +class NcclIdStore { + public: + NcclIdStore(int node_id, std::shared_ptr client) + : node_id_(node_id), client_(std::move(client)) {} + + StatusOr GetNcclUniqueId(const NcclCliqueKey& key); + + private: + const int node_id_; + const std::shared_ptr client_; + + absl::Mutex mu_; + absl::flat_hash_map cache_ GUARDED_BY(mu_); +}; + +StatusOr NcclIdStore::GetNcclUniqueId(const NcclCliqueKey& key) { + std::string key_string = GlobalDeviceIdsToString(key.devices()); + { + absl::MutexLock lock(&mu_); + auto it = cache_.find(key_string); + if (it != cache_.end()) { + return it->second; + } + } + auto result = [&]() -> StatusOr { + // TODO(phawkins): this will deadlock if node 0 is not involved in the + // computation. Add support for computations that only use a subset of + // replicas. + if (node_id_ == 0) { +#ifdef NCCL_ENABLED + ncclUniqueId id; + ncclResult_t r = ncclGetUniqueId(&id); + TF_RET_CHECK(r == ncclSuccess); + std::string value(id.internal, NCCL_UNIQUE_ID_BYTES); + TF_RETURN_IF_ERROR(client_->KeyValueSet(key_string, value)); + return value; +#else + return FailedPrecondition("NCCL support was not built into XLA binary."); +#endif + } else { + return client_->BlockingKeyValueGet(key_string, absl::Minutes(5)); + } + }(); + if (!result.ok()) { + return result.status(); + } + absl::MutexLock lock(&mu_); + return cache_.emplace(key_string, result.ValueOrDie()).first->second; +} + +std::vector> BuildLocalDevices( + std::vector> local_device_states) { + std::vector> devices; + for (auto& local_device : local_device_states) { + int device_ordinal = local_device->device_ordinal(); + auto device = absl::make_unique( + device_ordinal, std::move(local_device), /*node_id=*/0); + devices.push_back(std::move(device)); + } + return devices; +} + +Status BuildDistributedDevices( + std::vector> local_device_states, + std::shared_ptr distributed_client, int node_id, + std::vector>* devices, + GpuExecutableRunOptions* gpu_executable_run_options) { + LocalTopologyProto local_topology; + local_topology.set_node_id(node_id); + for (const auto& local_device : local_device_states) { + const se::Platform* platform = local_device->executor()->platform(); + TF_ASSIGN_OR_RETURN( + std::unique_ptr desc, + platform->DescriptionForDevice(local_device->device_ordinal())); + TF_RET_CHECK(local_device->device_ordinal() == + local_topology.devices_size()); + DeviceProto* device_proto = local_topology.add_devices(); + device_proto->set_local_device_ordinal(local_device->device_ordinal()); + device_proto->set_name(desc->name()); + device_proto->set_vendor(desc->device_vendor()); + } + + GlobalTopologyProto global_topology; + TF_RETURN_IF_ERROR( + distributed_client->Connect(local_topology, &global_topology)); + + std::vector gpu_device_ids(local_device_states.size()); + for (const LocalTopologyProto& node : global_topology.nodes()) { + for (const DeviceProto& device_proto : node.devices()) { + std::unique_ptr local_device; + if (node.node_id() == node_id) { + TF_RET_CHECK(device_proto.local_device_ordinal() >= 0 && + device_proto.local_device_ordinal() < + local_device_states.size()); + TF_RET_CHECK(local_device_states[device_proto.local_device_ordinal()] != + nullptr); + local_device = + std::move(local_device_states[device_proto.local_device_ordinal()]); + gpu_device_ids[device_proto.local_device_ordinal()] = + 
GlobalDeviceId(device_proto.global_device_id()); + } + auto device = + absl::make_unique(device_proto.global_device_id(), + std::move(local_device), node.node_id()); + devices->push_back(std::move(device)); + } + } + for (const auto& device : local_device_states) { + TF_RET_CHECK(device == nullptr); + } + gpu_executable_run_options->set_gpu_global_device_ids( + std::move(gpu_device_ids)); + auto nccl_id_store = + std::make_shared(node_id, distributed_client); + gpu_executable_run_options->set_nccl_unique_id_callback( + [nccl_id_store](const NcclCliqueKey& key) { + return nccl_id_store->GetNcclUniqueId(key); + }); + return Status::OK(); +} + +} // namespace + +GpuDevice::GpuDevice(int id, + std::unique_ptr local_device_state, + int node_id) + : Device(id, std::move(local_device_state), kGpuPlatformName, node_id) {} + +StatusOr> GetNvidiaGpuClient( + bool asynchronous, const GpuAllocatorConfig& allocator_config, + std::shared_ptr distributed_client, int node_id) { + TF_ASSIGN_OR_RETURN(LocalClient * xla_client, GetGpuXlaClient()); + TF_ASSIGN_OR_RETURN( + std::vector> local_device_states, + BuildLocalDeviceStates(xla_client, asynchronous)); + TF_ASSIGN_OR_RETURN( + auto allocator, + GetGpuDeviceAllocator(allocator_config, local_device_states)); + auto host_memory_allocator = + GetGpuHostAllocator(local_device_states.front()->executor()); + + std::vector> devices; + auto gpu_run_options = absl::make_unique(); + if (distributed_client) { + TF_RETURN_IF_ERROR(BuildDistributedDevices( + std::move(local_device_states), std::move(distributed_client), node_id, + &devices, gpu_run_options.get())); + } else { + devices = BuildLocalDevices(std::move(local_device_states)); + } + + std::shared_ptr pyclient = std::make_shared( + "gpu", xla_client, std::move(devices), + /*node_id=*/node_id, std::move(allocator), + std::move(host_memory_allocator), + /*gpu_run_options=*/std::move(gpu_run_options)); + return pyclient; } } // namespace xla diff --git a/tensorflow/compiler/xla/python/nvidia_gpu_device.h b/tensorflow/compiler/xla/python/nvidia_gpu_device.h index a89f8044d4f..333a82a2d78 100644 --- a/tensorflow/compiler/xla/python/nvidia_gpu_device.h +++ b/tensorflow/compiler/xla/python/nvidia_gpu_device.h @@ -18,14 +18,17 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/python/distributed/client.h" #include "tensorflow/compiler/xla/python/local_client.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/common_runtime/bfc_allocator.h" namespace xla { class GpuDevice : public Device { public: - GpuDevice(int id, std::unique_ptr local_device_state); + GpuDevice(int id, std::unique_ptr local_device_state, + int node_id); }; struct GpuAllocatorConfig { @@ -48,8 +51,11 @@ struct GpuAllocatorConfig { bool preallocate = true; }; +// distributed_client may be nullptr in non-distributed settings. +// distributed_client should not be Open()ed before calling this function. StatusOr> GetNvidiaGpuClient( - bool asynchronous, const GpuAllocatorConfig& allocator_config); + bool asynchronous, const GpuAllocatorConfig& allocator_config, + std::shared_ptr distributed_client, int node_id); } // namespace xla diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.cc b/tensorflow/compiler/xla/python/shared_device_buffer.cc index ca6da645024..91f2b434a61 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.cc +++ b/tensorflow/compiler/xla/python/shared_device_buffer.cc @@ -15,6 +15,7 @@ limitations under the License. 
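For context, a hedged sketch of how a single-node caller might use the reworked factory; the argument list follows the GetNvidiaGpuClient declaration in nvidia_gpu_device.h, the field values are example choices, and the surrounding function is illustrative only.

#include <memory>

#include "tensorflow/compiler/xla/python/nvidia_gpu_device.h"

// Illustrative single-node caller: a null distributed client and node_id 0
// select the non-distributed path.
void CreateLocalGpuClient() {
  xla::GpuAllocatorConfig allocator_config;
  allocator_config.memory_fraction = 0.9;  // example value
  allocator_config.preallocate = true;

  auto client_or = xla::GetNvidiaGpuClient(
      /*asynchronous=*/true, allocator_config,
      /*distributed_client=*/nullptr, /*node_id=*/0);
  if (!client_or.ok()) {
    // E.g. "No visible NVidia GPU devices."
    LOG(ERROR) << client_or.status();
    return;
  }
  auto client = client_or.ValueOrDie();  // shared_ptr to the PyLocalClient
  VLOG(1) << "GPU devices: " << client->device_count();
}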
#include "tensorflow/compiler/xla/python/shared_device_buffer.h" +#include #include #include "tensorflow/stream_executor/device_memory.h" @@ -60,7 +61,8 @@ static std::shared_ptr BufferFromScopedShapedBufferIterator( int device_ordinal, se::DeviceMemoryAllocator* allocator, ShapeTree::iterator* iterator, const ShapeTree::iterator& end, - const std::shared_ptr& definition_event) { + absl::Span> + definition_events) { std::vector buffers; buffers.reserve(1); std::vector> children; @@ -78,7 +80,7 @@ static std::shared_ptr BufferFromScopedShapedBufferIterator( for (int i = 0; i < num_children; ++i) { children.push_back(BufferFromScopedShapedBufferIterator( on_host_shape.tuple_shapes(i), on_device_shape.tuple_shapes(i), - device_ordinal, allocator, iterator, end, definition_event)); + device_ordinal, allocator, iterator, end, definition_events)); } } else { // An on-host array may be an on-device tuple. For example, a complex tensor @@ -88,20 +90,21 @@ static std::shared_ptr BufferFromScopedShapedBufferIterator( [&](const Shape&, const ShapeIndex&) { consume_buffer(); }); } return std::make_shared( - absl::Span(buffers), children, definition_event); + absl::Span(buffers), children, definition_events); } /* static */ std::shared_ptr SharedDeviceBuffer::FromScopedShapedBuffer( ScopedShapedBuffer* shaped_buffer, - const std::shared_ptr& definition_event) { + absl::Span> + definition_events) { ShapeTree::iterator iterator = shaped_buffer->buffers().begin(); std::shared_ptr output = BufferFromScopedShapedBufferIterator( shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(), shaped_buffer->device_ordinal(), shaped_buffer->memory_allocator(), - &iterator, shaped_buffer->buffers().end(), definition_event); + &iterator, shaped_buffer->buffers().end(), definition_events); CHECK(iterator == shaped_buffer->buffers().end()); return output; } @@ -111,7 +114,8 @@ SharedDeviceBuffer::MakeTuple( std::vector> children, const Shape& on_host_shape, TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, - std::shared_ptr definition_event) { + absl::Span> + definition_events) { CHECK(on_host_shape.IsTuple() && on_host_shape.tuple_shapes_size() == children.size()); TF_ASSIGN_OR_RETURN( @@ -122,7 +126,7 @@ SharedDeviceBuffer::MakeTuple( return std::make_shared( allocator, device_ordinal, std::initializer_list{device_memory.Release()}, - std::move(children), std::move(definition_event), + std::move(children), definition_events, /*on_delete_callback=*/nullptr); } @@ -130,7 +134,8 @@ SharedDeviceBuffer::MakeTuple( SharedDeviceBuffer::MakeArray( Shape on_device_shape, TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, - std::shared_ptr definition_event) { + absl::Span> + definition_events) { std::vector device_buffers; TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( on_device_shape, [&](const Shape& subshape, const ShapeIndex&) -> Status { @@ -145,7 +150,7 @@ SharedDeviceBuffer::MakeArray( return std::make_shared( absl::Span(device_buffers), /*children=*/std::vector>{}, - std::move(definition_event)); + definition_events); } // Populates a buffer tree from a ShapeTree iterator. 
@@ -176,25 +181,36 @@ ShapedBuffer SharedDeviceBuffer::AsShapedBuffer(const Shape& on_host_shape, return shaped_buffer; } +namespace { + +using MoveIterator = + absl::Span>::iterator; + +} // namespace + SharedDeviceBuffer::SharedDeviceBuffer( se::DeviceMemoryAllocator* allocator, int device_ordinal, absl::Span device_memory, std::vector> children, - std::shared_ptr definition_event, + absl::Span> definition_events, std::function on_delete_callback) : allocator_(allocator), device_ordinal_(device_ordinal), device_memory_(device_memory.begin(), device_memory.end()), children_(std::move(children)), - definition_event_(std::move(definition_event)), + definition_events_( + std::move_iterator(definition_events.begin()), + std::move_iterator(definition_events.end())), on_delete_callback_(std::move(on_delete_callback)) {} SharedDeviceBuffer::SharedDeviceBuffer( absl::Span device_memory, std::vector> children, - std::shared_ptr definition_event) + absl::Span> definition_events) : children_(std::move(children)), - definition_event_(std::move(definition_event)) { + definition_events_( + std::move_iterator(definition_events.begin()), + std::move_iterator(definition_events.end())) { CHECK(!device_memory.empty()); allocator_ = device_memory.front().allocator(); device_ordinal_ = device_memory.front().device_ordinal(); @@ -222,8 +238,8 @@ SharedDeviceBuffer::~SharedDeviceBuffer() { void GetDeviceBufferDefinitionEvents( const SharedDeviceBuffer& buffer, absl::flat_hash_set* events) { - if (buffer.definition_event()) { - events->insert(buffer.definition_event().get()); + for (const auto& e : buffer.definition_events()) { + events->insert(e.get()); } for (const auto& child : buffer.children()) { GetDeviceBufferDefinitionEvents(*child, events); diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.h b/tensorflow/compiler/xla/python/shared_device_buffer.h index 8d9d8278d33..3aa122c535d 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.h +++ b/tensorflow/compiler/xla/python/shared_device_buffer.h @@ -66,7 +66,7 @@ class BufferDefinitionEvent { void WaitForEventOnStream(se::Stream* stream); private: - bool EventHasBeenRecorded() EXCLUSIVE_LOCKS_REQUIRED(mu_); + bool EventHasBeenRecorded() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // An event that is triggered when the content of one or more buffers is // ready. If this event is nullptr, it is assumed that the buffer's content is @@ -77,7 +77,7 @@ class BufferDefinitionEvent { // A list of all streams for which the buffer's content is known to be defined // at the tail of the queue, i.e., for any newly enqueued command. - absl::InlinedVector streams_defined_on_ GUARDED_BY(mu_); + absl::InlinedVector streams_defined_on_ TF_GUARDED_BY(mu_); }; // Class that represents a node in a reference-counted DAG of device buffers. @@ -93,20 +93,23 @@ class SharedDeviceBuffer { // buffers of the shaped_buffer. static std::shared_ptr FromScopedShapedBuffer( ScopedShapedBuffer* shaped_buffer, - const std::shared_ptr& definition_event); + absl::Span> + definition_events); // Makes a tuple buffer. Does not initialize the tuple table. static StatusOr> MakeTuple( std::vector> children, const Shape& on_host_shape, TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, - std::shared_ptr definition_event); + absl::Span> + definition_events); // Makes an uninitialized array buffer. 
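The constructors above fill definition_events_ through std::move_iterator, so the shared_ptrs are moved out of the caller's span rather than copied. A self-contained illustration of that idiom with plain std types; TakeAll is an illustrative name.

#include <iterator>
#include <memory>
#include <string>
#include <vector>

// Constructing the destination from move_iterators steals the shared_ptrs
// out of the source range instead of bumping their reference counts.
std::vector<std::shared_ptr<std::string>> TakeAll(
    std::vector<std::shared_ptr<std::string>>* in) {
  return std::vector<std::shared_ptr<std::string>>(
      std::make_move_iterator(in->begin()),
      std::make_move_iterator(in->end()));
}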
static StatusOr> MakeArray( Shape on_device_shape, TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, - std::shared_ptr definition_event); + absl::Span> + definition_events); // Builds a ShapedBuffer view onto the buffers of 'tree'. We require but do // not verify that TransferManager::HostShapeToDeviceShape(on_host_shape) == @@ -126,19 +129,22 @@ class SharedDeviceBuffer { const absl::InlinedVector& device_memory() const { return device_memory_; } - const std::shared_ptr definition_event() const { - return definition_event_; + absl::Span> definition_events() + const { + return definition_events_; } SharedDeviceBuffer() = default; SharedDeviceBuffer(se::DeviceMemoryAllocator* allocator, int device_ordinal, absl::Span device_memory, std::vector> children, - std::shared_ptr definition_event, + absl::Span> + definition_events, std::function on_delete_callback); SharedDeviceBuffer(absl::Span device_memory, std::vector> children, - std::shared_ptr definition_event); + absl::Span> + definition_events); ~SharedDeviceBuffer(); private: @@ -155,7 +161,8 @@ class SharedDeviceBuffer { // ready during multistream execution. May be nullptr, which is used in the // single-stream execution case where events are not necessary for buffer // event sequencing. - std::shared_ptr definition_event_; + absl::InlinedVector, 2> + definition_events_; // A callback to call when the SharedDeviceBuffer is about to be destroyed. std::function on_delete_callback_; diff --git a/tensorflow/compiler/xla/python/shared_device_buffer_test.cc b/tensorflow/compiler/xla/python/shared_device_buffer_test.cc index b39767a0d46..05842c52a0c 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer_test.cc +++ b/tensorflow/compiler/xla/python/shared_device_buffer_test.cc @@ -28,10 +28,10 @@ TEST(SharedDeviceBufferTest, MakeArray) { LocalClient* client = ClientLibrary::LocalClientOrDie(); Shape shape = ShapeUtil::MakeShape(F32, {3, 101, 4}); - TF_ASSERT_OK_AND_ASSIGN( - auto buffer, SharedDeviceBuffer::MakeArray( - shape, client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); + TF_ASSERT_OK_AND_ASSIGN(auto buffer, + SharedDeviceBuffer::MakeArray( + shape, client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); EXPECT_EQ(buffer->children().size(), 0); EXPECT_EQ(buffer->device_ordinal(), 0); EXPECT_EQ(buffer->allocator(), client->backend().memory_allocator()); @@ -45,19 +45,19 @@ TEST(SharedDeviceBufferTest, MakeTuple) { Shape a_shape = ShapeUtil::MakeShape(F32, {3, 101, 4}); Shape b_shape = ShapeUtil::MakeShape(S8, {77}); Shape tuple_shape = ShapeUtil::MakeTupleShape({a_shape, b_shape}); - TF_ASSERT_OK_AND_ASSIGN( - auto a_buffer, SharedDeviceBuffer::MakeArray( - a_shape, client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); - TF_ASSERT_OK_AND_ASSIGN( - auto b_buffer, SharedDeviceBuffer::MakeArray( - b_shape, client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); - TF_ASSERT_OK_AND_ASSIGN( - auto tuple_buffer, SharedDeviceBuffer::MakeTuple( - {a_buffer, b_buffer}, tuple_shape, - client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); + TF_ASSERT_OK_AND_ASSIGN(auto a_buffer, + SharedDeviceBuffer::MakeArray( + a_shape, client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto b_buffer, + SharedDeviceBuffer::MakeArray( + b_shape, 
client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto tuple_buffer, + SharedDeviceBuffer::MakeTuple( + {a_buffer, b_buffer}, tuple_shape, + client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); ASSERT_EQ(tuple_buffer->children().size(), 2); EXPECT_EQ(tuple_buffer->children()[0], a_buffer); EXPECT_EQ(tuple_buffer->children()[1], b_buffer); @@ -75,30 +75,28 @@ TEST(SharedDeviceBufferTest, AsShapedBuffer) { Shape ab_tuple_shape = ShapeUtil::MakeTupleShape({a_shape, b_shape}); Shape c_shape = ShapeUtil::MakeShape(S64, {}); Shape abc_tuple_shape = ShapeUtil::MakeTupleShape({c_shape, ab_tuple_shape}); - TF_ASSERT_OK_AND_ASSIGN( - auto a_buffer, SharedDeviceBuffer::MakeArray( - a_shape, client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); - TF_ASSERT_OK_AND_ASSIGN( - auto b_buffer, SharedDeviceBuffer::MakeArray( - b_shape, client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); - TF_ASSERT_OK_AND_ASSIGN( - auto ab_tuple_buffer, - SharedDeviceBuffer::MakeTuple({a_buffer, b_buffer}, ab_tuple_shape, - client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, - nullptr)); - TF_ASSERT_OK_AND_ASSIGN( - auto c_buffer, SharedDeviceBuffer::MakeArray( - c_shape, client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); - TF_ASSERT_OK_AND_ASSIGN( - auto abc_tuple_buffer, - SharedDeviceBuffer::MakeTuple( - {c_buffer, ab_tuple_buffer}, abc_tuple_shape, - client->backend().transfer_manager(), - client->backend().memory_allocator(), 0, nullptr)); + TF_ASSERT_OK_AND_ASSIGN(auto a_buffer, + SharedDeviceBuffer::MakeArray( + a_shape, client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto b_buffer, + SharedDeviceBuffer::MakeArray( + b_shape, client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto ab_tuple_buffer, + SharedDeviceBuffer::MakeTuple( + {a_buffer, b_buffer}, ab_tuple_shape, + client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto c_buffer, + SharedDeviceBuffer::MakeArray( + c_shape, client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); + TF_ASSERT_OK_AND_ASSIGN(auto abc_tuple_buffer, + SharedDeviceBuffer::MakeTuple( + {c_buffer, ab_tuple_buffer}, abc_tuple_shape, + client->backend().transfer_manager(), + client->backend().memory_allocator(), 0, {})); Shape abc_tuple_device_shape = client->backend().transfer_manager()->HostShapeToDeviceShape( abc_tuple_shape); @@ -140,7 +138,7 @@ TEST(SharedDeviceBufferTest, FromScopedShapedBuffer) { ScopedShapedBuffer shaped_buffer, client->LiteralToShapedBuffer(literal, /*device_ordinal=*/0)); std::shared_ptr device_buffer = - SharedDeviceBuffer::FromScopedShapedBuffer(&shaped_buffer, nullptr); + SharedDeviceBuffer::FromScopedShapedBuffer(&shaped_buffer, {}); ASSERT_EQ(device_buffer->device_memory().size(), 1); ASSERT_EQ(device_buffer->children().size(), 2); diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD index 148822f3ba7..b5f1a831d4a 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD @@ -90,8 +90,3 @@ cc_library( name = "libtpu", hdrs = ["libtpu.h"], ) - -cc_library( - name = 
"libtftpu", - hdrs = ["libtftpu.h"], -) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc index 33573c1c8d8..706db57c4ac 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc @@ -35,8 +35,6 @@ limitations under the License. namespace xla { -constexpr char kTpuPlatform[] = "tpu"; - TpuDevice::TpuDevice(int id, int host_id, const std::array& coords, int core_on_chip) : xla::Device(id, /*local_device_state=*/nullptr, kTpuPlatform, host_id), @@ -154,7 +152,7 @@ StatusOr PyTpuClient::GetDefaultDeviceAssignment( Status PyTpuClient::CheckDeviceId(int device_id, absl::string_view caller_name) { if (device_id < 0 || device_id >= device_count()) { - return InvalidArgument("%s got bad device_id: %d (num_devices=%d)", + return InvalidArgument("%s got bad device_id: %d (num_devices=%d).", caller_name, device_id, device_count()); } return Status::OK(); @@ -174,12 +172,12 @@ static Status CheckDataType(xla::PrimitiveType dtype) { StatusOr> PyTpuBuffer::FromLiterals( std::vector leaves, const Shape& tuple_shape, std::shared_ptr leaves_references, - std::shared_ptr client, int device_id) { + std::shared_ptr client, std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::FromLiterals"); VLOG(1) << "PyTpuBuffer::FromLiterals: shape: " << tuple_shape.DebugString() - << " device id: " << device_id; + << " device: " << device->DebugString(); TF_RETURN_IF_ERROR( - client->CheckDeviceId(device_id, "PyTpuBuffer::FromLiterals")); + client->CheckDeviceId(device->id(), "PyTpuBuffer::FromLiterals")); tpu_driver::TpuDriver* driver = client->driver(); if (!tuple_shape.IsTuple()) { @@ -193,7 +191,7 @@ StatusOr> PyTpuBuffer::FromLiterals( event->AddCallback([leaves_references](Status) {}); return event; }, - std::move(client), device_id); + std::move(client), std::move(device)); } std::vector> child_buffers; @@ -213,7 +211,7 @@ StatusOr> PyTpuBuffer::FromLiterals( [driver, &leaf, &indexed_shape](tpu_driver::BufferHandle* handle) { return driver->TransferToDevice(leaf.untyped_data(), handle, {}); }, - client, device_id)); + client, device)); child_buffer_ptrs.push_back(child_buffer.get()); child_buffers.push_back(std::move(child_buffer)); ++it_leaf; @@ -223,13 +221,14 @@ StatusOr> PyTpuBuffer::FromLiterals( // `MakeTuple` will extract and make the tuple buffer hold onto the // `device_buffer_` contained in each `child_buffer`, so it's safe for // `child_buffers` to get destroyed before this call returns. 
- return MakeTuple(std::move(child_buffer_ptrs), std::move(client), device_id); + return MakeTuple(std::move(child_buffer_ptrs), std::move(client), + std::move(device)); } /* static */ StatusOr> PyTpuBuffer::MakeTuple( const std::vector buffers, - std::shared_ptr client, int device_id) { + std::shared_ptr client, std::shared_ptr device) { std::vector child_shapes; std::vector> child_device_buffers; std::vector child_handle_ptrs; @@ -252,11 +251,11 @@ StatusOr> PyTpuBuffer::MakeTuple( Shape tuple_shape = ShapeUtil::MakeTupleShape(child_shapes); std::unique_ptr tuple_handle = - client->driver()->AllocateTuple(device_id, tpu_driver::MemoryRegion::HBM, - child_handle_ptrs, {}); + client->driver()->AllocateTuple( + device->id(), tpu_driver::MemoryRegion::HBM, child_handle_ptrs, {}); auto tuple_device_buffer = std::make_shared( client->driver(), std::move(tuple_handle), std::move(child_events), - device_id); + std::move(device)); return absl::make_unique( tuple_shape, std::move(tuple_device_buffer), std::move(child_device_buffers), std::move(client)); @@ -268,7 +267,7 @@ PyTpuBuffer::PyTpuBuffer( std::shared_ptr client) : client_(std::move(client)), on_host_shape_(std::move(on_host_shape)), - device_id_(device_buffer->device_id), + device_(device_buffer->device), device_buffer_(std::move(device_buffer)), child_buffers_(std::move(child_buffers)) {} @@ -368,11 +367,11 @@ PyTpuBuffer::DestructureTuple() { if (!on_host_shape_.IsTuple()) { return InvalidArgument( "Attempted to destructure a PyTpuBuffer that did not have a tuple " - "shape; shape: %s", + "shape; shape: %s.", ShapeUtil::HumanString(on_host_shape_)); } if (DeviceBuffer() == nullptr) { - return InvalidArgument("Attempted to destructure a deleted buffer"); + return InvalidArgument("Attempted to destructure a deleted buffer."); } absl::MutexLock lock(&mu_); @@ -388,14 +387,14 @@ PyTpuBuffer::DestructureTuple() { } StatusOr> PyTpuBuffer::CopyToDevice( - int dst_device_id) { + std::shared_ptr dst_device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::CopyToDevice"); if (on_host_shape_.IsTuple()) { return Unimplemented("CopyToDevice for tuples is not supported."); } std::shared_ptr src_device_buffer = DeviceBuffer(); - if (dst_device_id == device_id_) { + if (dst_device->id() == device_->id()) { return absl::make_unique( on_host_shape_, src_device_buffer, std::vector>(), client_); @@ -414,7 +413,7 @@ StatusOr> PyTpuBuffer::CopyToDevice( return driver->TransferFromDeviceToDevice( src_device_buffer->handle.get(), dst_handle, src_wait_for_use); }, - client_, dst_device_id)); + client_, std::move(dst_device))); // TODO(jiawenhao): This may be too pessimistic: it prevents future readers // from reading `src_device_buffer` until the device-to-device copy is done. // Should this go into a new `TpuSharedBuffer::wait_for_dealloc` field? 
@@ -432,13 +431,15 @@ Status PyTpuBuffer::BlockHostUntilReady() { /* static */ StatusOr> PyTpuBuffer::AllocateBuffer( - const Shape& shape, std::shared_ptr client, int device_id) { + const Shape& shape, std::shared_ptr client, + std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::AllocateBuffer"); VLOG(1) << "PyTpuBuffer::AllocateBuffer: shape: " << shape.DebugString() - << " device ordinal: " << device_id; + << " device: " << device->DebugString(); if (!shape.IsTuple()) { - return CreateBuffer(shape, absl::nullopt, std::move(client), device_id); + return CreateBuffer(shape, absl::nullopt, std::move(client), + std::move(device)); } std::vector> child_buffers; @@ -448,7 +449,7 @@ StatusOr> PyTpuBuffer::AllocateBuffer( for (const auto& child_shape : shape.tuple_shapes()) { TF_ASSIGN_OR_RETURN(std::unique_ptr child_buffer, - AllocateBuffer(child_shape, client, device_id)); + AllocateBuffer(child_shape, client, device)); child_buffer_ptrs.push_back(child_buffer.get()); child_buffers.push_back(std::move(child_buffer)); } @@ -457,21 +458,23 @@ StatusOr> PyTpuBuffer::AllocateBuffer( // `device_buffer_` contained in each `child_buffer`, so it's safe for // `child_buffers` to get destroyed before this call returns. return PyTpuBuffer::MakeTuple(child_buffer_ptrs, std::move(client), - device_id); + std::move(device)); } /*static*/ StatusOr> PyTpuBuffer::CreateBuffer( const Shape& non_tuple_shape, absl::optional initializer, - std::shared_ptr client, int device_id) { + std::shared_ptr client, std::shared_ptr device) { tensorflow::profiler::TraceMe traceme("PyTpuBuffer::CreateBuffer"); VLOG(1) << "PyTpuBuffer::CreateBuffer: shape: " - << non_tuple_shape.DebugString() << " device id: " << device_id; + << non_tuple_shape.DebugString() + << " device: " << device->DebugString(); TF_RET_CHECK(!non_tuple_shape.IsTuple()); TF_RETURN_IF_ERROR(CheckDataType(non_tuple_shape.element_type())); - std::unique_ptr handle = client->driver()->Allocate( - device_id, tpu_driver::MemoryRegion::HBM, non_tuple_shape.ToProto(), {}); + std::unique_ptr handle = + client->driver()->Allocate(device->id(), tpu_driver::MemoryRegion::HBM, + non_tuple_shape.ToProto(), {}); // If this buffer needs to be initialized, anyone using this buffer must wait // for the initialization event in `wait_for_use` to finish first. 
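The comment above describes the wait_for_use contract: anyone reading a freshly created buffer must first wait on its initialization event. A generic sketch of that pattern using std::shared_future in place of tpu_driver events; SketchBuffer is illustrative only.

#include <future>
#include <memory>
#include <vector>

// A buffer carries the events that must complete before its contents may be
// read; a consumer blocks on all of them before use.
struct SketchBuffer {
  std::vector<std::shared_future<void>> wait_for_use;

  void BlockUntilReady() const {
    for (const std::shared_future<void>& event : wait_for_use) {
      event.wait();
    }
  }
};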
@@ -481,7 +484,8 @@ StatusOr> PyTpuBuffer::CreateBuffer( wait_for_use.push_back(std::move(init)); } auto device_buffer = std::make_shared( - client->driver(), std::move(handle), std::move(wait_for_use), device_id); + client->driver(), std::move(handle), std::move(wait_for_use), + std::move(device)); return absl::make_unique( non_tuple_shape, std::move(device_buffer), @@ -542,7 +546,8 @@ PyTpuExecutable::ExecuteResult PyTpuExecutable::ExecuteHelper( << " mapped to device id for execution: " << device_id; std::unique_ptr<::xla::PyTpuBuffer> output_buffer = - ::xla::PyTpuBuffer::AllocateBuffer(result_shape_, client_, device_id) + ::xla::PyTpuBuffer::AllocateBuffer(result_shape_, client_, + std::move(device)) .ValueOrDie(); VLOG(1) << "Created output buffer: " << result_shape_.DebugString(); @@ -610,12 +615,12 @@ StatusOr> PyTpuExecutable::Execute( absl::Span argument_handles) { if (num_replicas() != 1) { return InvalidArgument( - "Attempted to execute computation with %d replicas using Execute()", + "Attempted to execute computation with %d replicas using Execute().", num_replicas()); } if (num_partitions() != 1) { return InvalidArgument( - "Attempted to execute computation with %d partitions using Execute()", + "Attempted to execute computation with %d partitions using Execute().", num_partitions()); } @@ -636,19 +641,6 @@ StatusOr> PyTpuExecutable::Execute( return std::move(result.buffer); } -StatusOr>> -PyTpuExecutable::ExecutePerReplica( - absl::Span> argument_handles) { - tensorflow::profiler::TraceMe traceme("PyTpuExecutable::ExecutePerReplica"); - if (num_partitions() != 1) { - return InvalidArgument( - "Attempted to execute computation with %d partitions using " - "ExecutePerReplica()", - num_partitions()); - } - return ExecuteOnLocalDevices(argument_handles); -} - StatusOr>> PyTpuExecutable::ExecuteOnLocalDevices( absl::Span> argument_handles) { @@ -660,7 +652,7 @@ PyTpuExecutable::ExecuteOnLocalDevices( if (argument_handles.size() != num_local_devices) { return InvalidArgument( "Attempted to execute with %d argument lists when local device " - "count is %d (total replica count: %d, partition count: %d)", + "count is %d (total replica count: %d, partition count: %d).", argument_handles.size(), num_local_devices, num_replicas(), num_partitions()); } @@ -717,54 +709,6 @@ PyTpuExecutable::ExecuteOnLocalDevices( return wrapped_results; } -/*static*/ StatusOr> -PyTpuExecutable::CompileForDevices( - const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - const std::vector>>& - device_assignment) { - if (device_assignment.empty()) { - return InvalidArgument( - "Device assignment passed to Compile() must be non-empty."); - } - if (device_assignment[0].empty()) { - return InvalidArgument( - "Device assignment passed to Compile() must have a nonzero number of " - "partitions per replica; replica 0 had 0 partitions."); - } - DeviceAssignment xla_assignment(device_assignment.size(), - device_assignment[0].size()); - for (int replica = 0; replica < device_assignment.size(); ++replica) { - if (device_assignment[replica].size() != device_assignment[0].size()) { - return InvalidArgument( - "Device assignment passed to Compile() has different numbers of " - "partitions between replicas; %d partitions for replica %d versus %d " - "partitions for replica 0.", - device_assignment[replica].size(), replica, - device_assignment[0].size()); - } - for (int partition = 0; partition < device_assignment[replica].size(); - 
++partition) { - if (device_assignment[0][0]->platform_name() != - device_assignment[replica][partition]->platform_name()) { - return InvalidArgument( - "Device assignment passed to Compile() must have devices of a " - "single kind, got %s for replica 0 partition 0 and %s for replica " - "%d partition %d.", - device_assignment[0][0]->platform_name(), - device_assignment[replica][partition]->platform_name(), replica, - partition); - } - xla_assignment(replica, partition) = - device_assignment[replica][partition]->id(); - } - } - return Compile(computation, std::move(argument_layouts), build_options, - std::move(client), xla_assignment); -} - /*static*/ StatusOr> PyTpuExecutable::Compile( const XlaComputation& computation, absl::optional> argument_layouts, diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h index f4815b44183..4b7670707fb 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.h @@ -36,6 +36,8 @@ limitations under the License. namespace xla { +constexpr char kTpuPlatform[] = "tpu"; + class TpuDevice : public Device { public: TpuDevice(int id, int host_id, const std::array& coords, @@ -126,9 +128,9 @@ struct TpuSharedBuffer final { TpuSharedBuffer(tpu_driver::TpuDriver* driver, std::unique_ptr handle, std::vector> wait_for_use, - int device_id) + std::shared_ptr src_device) : driver(driver), - device_id(device_id), + device(std::move(src_device)), handle(std::move(handle)), wait_for_use(std::move(wait_for_use)) {} @@ -141,7 +143,7 @@ struct TpuSharedBuffer final { } tpu_driver::TpuDriver* const driver; - const int device_id; + const std::shared_ptr device; std::unique_ptr handle; std::vector> wait_for_use; @@ -160,12 +162,12 @@ class PyTpuBuffer { static StatusOr> FromLiterals( std::vector leaves_literals, const Shape& tuple_shape, std::shared_ptr leaves_reference, - std::shared_ptr client, int device_id); + std::shared_ptr client, std::shared_ptr device); // Supports nested tuple creation. static StatusOr> MakeTuple( const std::vector buffers, - std::shared_ptr client, int device_id); + std::shared_ptr client, std::shared_ptr device); PyTpuBuffer() = delete; PyTpuBuffer(Shape on_host_shape, @@ -179,7 +181,7 @@ class PyTpuBuffer { PyTpuBuffer& operator=(PyTpuBuffer&&) = delete; const Shape& on_host_shape() const { return on_host_shape_; } - int device_id() const { return device_id_; } + std::shared_ptr device() const { return device_; } const std::string& platform_name() const { return client_->platform_name(); } std::shared_ptr client() const { return client_; } @@ -205,8 +207,10 @@ class PyTpuBuffer { // Destructures a tuple-valued PyTpuBuffer into its constituent elements. StatusOr>> DestructureTuple(); - // Copies the buffer to device `dst_device_id`. - StatusOr> CopyToDevice(int dst_device_id); + // Copies the buffer to target device `dst_device` and returns a PyTpuBuffer + // object holding the context to the target device buffer. + StatusOr> CopyToDevice( + std::shared_ptr dst_device); // Blocks the host until the buffer's value has been computed and is ready for // immediate use on the device. Useful in particular for timing benchmarks. @@ -215,7 +219,8 @@ class PyTpuBuffer { // Allocates uninitialized buffers on device `device_id`. If `shape` is a // tuple, the returned buffer corresponds to the root tuple buffer. 
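The removed CompileForDevices helper validated the replica-by-partition device grid before building a DeviceAssignment; callers now go through DevicesToDeviceAssignment in the Python binding instead. A standalone restatement of those checks with a stand-in device struct, for reference only.

#include <cstddef>
#include <string>
#include <vector>

// Stand-in for xla::Device, for this sketch only.
struct DeviceInfo {
  int id;
  std::string platform_name;
};

// Non-empty grid, equal partition counts per replica, and a single device
// kind; returns an error message, or the empty string on success.
std::string ValidateDeviceGrid(
    const std::vector<std::vector<DeviceInfo>>& grid) {
  if (grid.empty()) return "device assignment must be non-empty";
  if (grid.front().empty()) return "replica 0 has 0 partitions";
  for (std::size_t r = 0; r < grid.size(); ++r) {
    if (grid[r].size() != grid.front().size()) {
      return "replicas have differing numbers of partitions";
    }
    for (const DeviceInfo& device : grid[r]) {
      if (device.platform_name != grid[0][0].platform_name) {
        return "devices must all be of a single kind";
      }
    }
  }
  return "";
}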
static StatusOr> AllocateBuffer( - const Shape& shape, std::shared_ptr client, int device_id); + const Shape& shape, std::shared_ptr client, + std::shared_ptr device); private: // Initializes a just allocated device buffer. The returned event will be @@ -226,18 +231,19 @@ class PyTpuBuffer { static StatusOr> CreateBuffer( const Shape& non_tuple_shape, absl::optional initializer, - std::shared_ptr client, int device_id); + std::shared_ptr client, std::shared_ptr device); const std::shared_ptr client_; const Shape on_host_shape_; - const int device_id_; + const std::shared_ptr device_; // If this is a tuple, `device_buffer_` stores the tuple buffer and // `child_buffers_` stores the child buffers; else, `device_buffer_` stores // the data content and `child_buffers_` is empty. mutable absl::Mutex mu_; - std::shared_ptr device_buffer_ GUARDED_BY(mu_); - std::vector> child_buffers_ GUARDED_BY(mu_); + std::shared_ptr device_buffer_ TF_GUARDED_BY(mu_); + std::vector> child_buffers_ + TF_GUARDED_BY(mu_); // The cached value of the buffer on the host, produced either from a call to // CopyToHost or from a call to ToLiteral. Once a value has been fetched to // the host, it persists Delete() is called or the PyTpuBuffer is destroyed. @@ -250,23 +256,13 @@ class PyTpuBuffer { Status status; std::shared_ptr value; }; - std::shared_ptr host_value_ GUARDED_BY(mu_); + std::shared_ptr host_value_ TF_GUARDED_BY(mu_); }; // Represents a compiled computation that can be executed given handles to // device-allocated literals. Wraps an XLA LocalExecutable. class PyTpuExecutable { public: - // Compiles a computation to an executable. - static StatusOr> CompileForDevices( - const XlaComputation& computation, - absl::optional> argument_layouts, - const ExecutableBuildOptions* build_options, - std::shared_ptr client, - const std::vector>>& - device_assignment); - - // TODO(phawkins): remove after changing callers to use the first overload. static StatusOr> Compile( const XlaComputation& computation, absl::optional> argument_layouts, @@ -309,20 +305,12 @@ class PyTpuExecutable { return local_devices_; } - // TODO(power): Both Execute and ExecutePerReplica block and wait inside for - // computation to finish. Coordinate with JAX code change to see if we can - // make both Execute and ExecutePerReplica non-blocking. + // TODO(power): Both Execute and ExecutePerOnLocalDevices block and wait + // inside for computation to finish. Coordinate with JAX code change to see if + // we can make both Execute and ExecutePerReplica non-blocking. StatusOr> Execute( absl::Span argument_handles); - // Execute on many replicas. Takes a sequence of argument lists (one argument - // list per replica) and returns a tuple of results (one result per replica). - // The number of argument lists must be equal to the replica count. - // The executable must have only one partition. - // TODO(cjfj): Remove this once JAX is moved to `ExecuteOnLocalDevices`. - StatusOr>> ExecutePerReplica( - absl::Span> argument_handles); - // Execute on local devices. Takes a sequence of argument lists (one argument // list per local device) and returns a tuple of results (one result per local // device). 
The number of argument lists must be equal to the local device diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index f6e2fab7ef0..0dcb9dc4c84 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -38,7 +38,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def("local_devices", &PyTpuClient::local_devices) .def("host_id", &PyTpuClient::host_id) .def("GetDefaultDeviceAssignment", - [](PyLocalClient* client, int num_replicas, int num_partitions) + [](PyTpuClient* client, int num_replicas, int num_partitions) -> StatusOr>>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( @@ -121,9 +121,9 @@ PYBIND11_MODULE(tpu_client_extension, m) { std::make_move_iterator(tree.leaves.end())); py::gil_scoped_release gil_release; - return PyTpuBuffer::FromLiterals(std::move(leaves), tree.shape, - std::move(py_buffer_ref), - std::move(client), device->id()); + return PyTpuBuffer::FromLiterals( + std::move(leaves), tree.shape, std::move(py_buffer_ref), + std::move(client), std::move(device)); }) .def_static("make_tuple", [](const std::vector buffers, @@ -137,15 +137,15 @@ PYBIND11_MODULE(tpu_client_extension, m) { "Cannot make tuple on device '%s' with '%s' backend", device->DebugString(), client->platform_name()); } - return PyTpuBuffer::MakeTuple(buffers, client, - device->id()); + return PyTpuBuffer::MakeTuple(buffers, std::move(client), + std::move(device)); }) .def("copy_to_device", [](PyTpuBuffer* buffer, std::shared_ptr dst_device) { CHECK(dst_device != nullptr); GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; - return buffer->CopyToDevice(dst_device->id()); + return buffer->CopyToDevice(std::move(dst_device)); }) .def("delete", &PyTpuBuffer::Delete) .def("destructure", &PyTpuBuffer::DestructureTuple) @@ -168,10 +168,7 @@ PYBIND11_MODULE(tpu_client_extension, m) { return LiteralToPython(std::move(literal)); }) .def("shape", &PyTpuBuffer::on_host_shape) - .def("device", - [](PyTpuBuffer* buffer) -> std::shared_ptr { - return buffer->client()->devices()[buffer->device_id()]; - }) + .def("device", &PyTpuBuffer::device) .def("platform", &PyTpuBuffer::platform_name) .def("is_deleted", [](const PyTpuBuffer& buffer) { return buffer.DeviceBuffer() == nullptr; @@ -180,8 +177,25 @@ PYBIND11_MODULE(tpu_client_extension, m) { py::class_(m, "TpuExecutable") .def_static("Compile", &PyTpuExecutable::Compile, py::call_guard()) - .def_static("Compile", &PyTpuExecutable::CompileForDevices, - py::call_guard()) + .def_static("Compile", + [](const XlaComputation& computation, + absl::optional> argument_layouts, + const ExecutableBuildOptions* build_options, + std::shared_ptr client, + absl::optional>> + device_assignment) + -> StatusOr> { + py::gil_scoped_release gil_release; + absl::optional xla_device_assignment; + if (device_assignment) { + TF_ASSIGN_OR_RETURN( + xla_device_assignment, + DevicesToDeviceAssignment(*device_assignment)); + } + return PyTpuExecutable::Compile( + computation, argument_layouts, build_options, client, + std::move(xla_device_assignment)); + }) .def("local_logical_device_ids", &PyTpuExecutable::local_logical_device_ids) .def("local_devices", &PyTpuExecutable::local_devices) @@ -190,8 +204,6 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def("Delete", &PyTpuExecutable::Delete) .def("Execute", 
&PyTpuExecutable::Execute, py::call_guard(), py::arg("arguments")) - .def("ExecutePerReplica", &PyTpuExecutable::ExecutePerReplica, - py::call_guard(), py::arg("arguments")) .def("ExecuteOnLocalDevices", &PyTpuExecutable::ExecuteOnLocalDevices, py::call_guard(), py::arg("arguments")); diff --git a/tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h b/tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h index 9127f0342fa..72e55b1d11e 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h +++ b/tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h @@ -23,7 +23,6 @@ #include #include -#include "absl/base/thread_annotations.h" #include "absl/container/flat_hash_map.h" #include "absl/container/inlined_vector.h" #include "absl/synchronization/mutex.h" diff --git a/tensorflow/compiler/xla/python/types.h b/tensorflow/compiler/xla/python/types.h index ceefbda4f90..564933c14f2 100644 --- a/tensorflow/compiler/xla/python/types.h +++ b/tensorflow/compiler/xla/python/types.h @@ -26,6 +26,7 @@ limitations under the License. #include "include/pybind11/pybind11.h" #include "include/pybind11/stl.h" #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/python/local_client.h" #include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" @@ -35,6 +36,97 @@ limitations under the License. namespace xla { +// Custom holder types. +// +// We must keep the PyLocalClient object alive as long as any of the runtime +// objects are alive. Since we don't have a lot of control over Python +// destructor ordering, we keep the PyLocalClient object as a std::shared_ptr<>, +// and ensure that each Python runtime object holds a reference to the +// PyLocalClient. An alternative design would be to keep a single global +// singleton PyLocalClient, although this seems less flexible, especially for +// writing tests. +// +// To maintain PyLocalClient references, we define pybind11 holder classes that +// are custom smart pointers that also keep a reference to a PyLocalClient. +// pybind11 has a `keep_alive` feature that has a similar goal, but it doesn't +// seem sufficiently flexible to describe ownership relationships in cases where +// the ownership doesn't pertain to a direct argument or return value of a +// function. Another alternative to the holder classes would be to create proxy +// objects that contain both a reference and a runtime class; holder classes +// seem less tedious to define. + +// A pair of a PyLocalClient reference and an unowned pointer to T. +template +struct ClientAndPtr { + ClientAndPtr() = default; + // pybind11 requires that we define a constructor that takes a raw pointer, + // but it should be unreachable. + explicit ClientAndPtr(T*) { + LOG(FATAL) << "ClientAndPtr should constructed via WrapWithClient."; + } + + ClientAndPtr(const ClientAndPtr&) = default; + ClientAndPtr(ClientAndPtr&&) = default; + ClientAndPtr& operator=(const ClientAndPtr&) = default; + ClientAndPtr& operator=(ClientAndPtr&&) = default; + + std::shared_ptr client; + T* contents; + + T* get() const { return contents; } + T* operator->() const { return contents; } + T& operator*() const { return *contents; } +}; + +// By defining a templated helper function, we can use return type deduction +// and avoid specifying types at the caller. 
+template +ClientAndPtr WrapWithClient(std::shared_ptr client, + T* contents) { + ClientAndPtr result; + result.client = std::move(client); + result.contents = contents; + return result; +} + +// A pair of a PyLocalClient reference and an owned pointer to T. +template +struct ClientAndUniquePtr { + ClientAndUniquePtr() = default; + // pybind11 requires that we define a constructor that takes a raw pointer, + // but it should be unreachable. + explicit ClientAndUniquePtr(T*) { + LOG(FATAL) << "ClientAndUniquePtr should constructed via WrapWithClient."; + } + ClientAndUniquePtr(const ClientAndUniquePtr&) = delete; + ClientAndUniquePtr(ClientAndUniquePtr&&) = default; + ClientAndUniquePtr& operator=(const ClientAndUniquePtr&) = delete; + ClientAndUniquePtr& operator=(ClientAndUniquePtr&&) = default; + + std::shared_ptr client; + std::unique_ptr contents; + + T* get() const { return contents.get(); } + T* operator->() const { return contents.get(); } + T& operator*() const { return *contents; } +}; + +template +ClientAndUniquePtr WrapWithClient(std::shared_ptr client, + std::unique_ptr contents) { + ClientAndUniquePtr result; + result.client = std::move(client); + result.contents = std::move(contents); + return result; +} + +} // namespace xla + +PYBIND11_DECLARE_HOLDER_TYPE(T, xla::ClientAndPtr); +PYBIND11_DECLARE_HOLDER_TYPE(T, xla::ClientAndUniquePtr); + +namespace xla { + // Initializes the NumPy API for the use of the types module. bool InitializeNumpyAPIForTypes(); diff --git a/tensorflow/compiler/xla/python/worker_thread.h b/tensorflow/compiler/xla/python/worker_thread.h index bc7dd396f88..598f7b1d4ae 100644 --- a/tensorflow/compiler/xla/python/worker_thread.h +++ b/tensorflow/compiler/xla/python/worker_thread.h @@ -40,11 +40,11 @@ class WorkerThread { void Schedule(std::function fn); private: - bool WorkAvailable() EXCLUSIVE_LOCKS_REQUIRED(mu_); + bool WorkAvailable() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); void WorkLoop(); absl::Mutex mu_; - std::queue> work_queue_ GUARDED_BY(mu_); + std::queue> work_queue_ TF_GUARDED_BY(mu_); std::unique_ptr thread_; }; diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 4be375ac15a..b42202ca838 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/types/optional.h" #include "absl/types/span.h" +#include "include/pybind11/cast.h" #include "include/pybind11/numpy.h" #include "include/pybind11/pybind11.h" #include "include/pybind11/pytypes.h" @@ -40,6 +41,9 @@ limitations under the License. 
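The ClientAndPtr and ClientAndUniquePtr holders above are registered with pybind11 through PYBIND11_DECLARE_HOLDER_TYPE. A minimal, self-contained sketch of that registration recipe; TrackingPtr and Widget are illustrative names, and only the get() accessor plus the raw-pointer constructor noted above are pybind11 requirements.

#include <pybind11/pybind11.h>

// A custom smart-pointer holder, following the same recipe as the holders
// defined in types.h.
template <typename T>
struct TrackingPtr {
  explicit TrackingPtr(T* p = nullptr) : ptr(p) {}
  T* get() const { return ptr; }
  T* operator->() const { return ptr; }
  T* ptr = nullptr;
};

PYBIND11_DECLARE_HOLDER_TYPE(T, TrackingPtr<T>);

// A class bound with this holder would then be declared as:
//   py::class_<Widget, TrackingPtr<Widget>>(m, "Widget");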
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/cpu_device.h" +#include "tensorflow/compiler/xla/python/distributed/client.h" +#include "tensorflow/compiler/xla/python/distributed/distributed.h" +#include "tensorflow/compiler/xla/python/distributed/service.h" #include "tensorflow/compiler/xla/python/dlpack.h" #include "tensorflow/compiler/xla/python/local_client.h" #include "tensorflow/compiler/xla/python/nvidia_gpu_device.h" @@ -70,7 +74,7 @@ namespace { struct Uniquer { absl::Mutex mu; - NameUniquer name_uniquer GUARDED_BY(mu); + NameUniquer name_uniquer TF_GUARDED_BY(mu); }; Uniquer* GetUniquer() { @@ -153,16 +157,6 @@ Status PyRegisterCustomCallTarget(const std::string& fn_name, return Status::OK(); } -StatusOr> LookupDeviceOrdinal( - PyLocalClient* client, int device_ordinal, absl::string_view caller_name) { - if (device_ordinal < 0 || device_ordinal >= client->local_device_count()) { - return InvalidArgument( - "%s got bad device_ordinal: %d (num_local_devices=%d)", caller_name, - device_ordinal, client->local_device_count()); - } - return client->local_devices()[device_ordinal]; -} - // PEP 3118 buffer protocol implementation. // Extra data to be kept alive by the consumer of the buffer protocol. @@ -552,9 +546,7 @@ class TraceMeContextManager { void Enter() { if (IsEnabled()) { std::string name(name_); - // TODO(skye): we can use kwargs_.empty() once we upgrade to pybind11 2.4 - // in workspace.bzl - if (kwargs_.size() != 0) { + if (!kwargs_.empty()) { absl::StrAppend(&name, "#"); bool first = true; for (const auto& entry : kwargs_) { @@ -764,7 +756,7 @@ PYBIND11_MODULE(xla_extension, m) { // Literals py::class_>(m, "Literal") .def("__repr__", &Literal::ToString); - py::class_(m, "LiteralSlice"); + py::class_ literal_slice(m, "LiteralSlice"); py::implicitly_convertible(); py::implicitly_convertible(); @@ -790,7 +782,7 @@ PYBIND11_MODULE(xla_extension, m) { .def("computation_count", &DeviceAssignment::computation_count) .def("__repr__", &DeviceAssignment::ToString); - py::class_>( + py::class_>( m, "Device", "A descriptor of an available device.\n\nSubclasses are used to " "represent specific types of devices, e.g. CPUs, GPUs. 
Subclasses may " @@ -832,12 +824,12 @@ PYBIND11_MODULE(xla_extension, m) { return LiteralToPython(std::move(literal_shared)); }); - py::class_>(m, "CpuDevice") + py::class_>(m, "CpuDevice") .def("__repr__", [](const CpuDevice& device) { return absl::StrFormat("CpuDevice(id=%i)", device.id()); }); - py::class_>(m, "GpuDevice") + py::class_>(m, "GpuDevice") .def("__repr__", [](const GpuDevice& device) { return absl::StrFormat("GpuDevice(id=%i)", device.id()); }); @@ -860,16 +852,33 @@ PYBIND11_MODULE(xla_extension, m) { py::class_>(m, "LocalClient") .def("device_count", &PyLocalClient::device_count) .def("local_device_count", &PyLocalClient::local_device_count) - .def("devices", &PyLocalClient::devices) - .def("local_devices", &PyLocalClient::local_devices) + .def("devices", + [](std::shared_ptr client) { + std::vector> devices; + devices.reserve(client->devices().size()); + for (const auto& device : client->devices()) { + devices.push_back(WrapWithClient(client, device.get())); + } + return devices; + }) + .def("local_devices", + [](std::shared_ptr client) { + std::vector> devices; + devices.reserve(client->local_devices().size()); + for (Device* device : client->local_devices()) { + devices.push_back(WrapWithClient(client, device)); + } + return devices; + }) .def("host_id", &PyLocalClient::host_id) .def("GetDefaultDeviceAssignment", - [](PyLocalClient* client, int num_replicas, int num_partitions) - -> StatusOr>>> { + [](std::shared_ptr client, int num_replicas, + int num_partitions) + -> StatusOr>>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( num_replicas, num_partitions)); - std::vector>> result; + std::vector>> result; result.resize(num_replicas); for (int r = 0; r < num_replicas; ++r) { result[r].resize(num_partitions); @@ -877,24 +886,24 @@ PYBIND11_MODULE(xla_extension, m) { int device_id = device_assignment(r, p); auto iter = client->id_to_device().find(device_id); CHECK(iter != client->id_to_device().end()) << device_id; - result[r][p] = iter->second; + result[r][p] = WrapWithClient(client, iter->second); } } return result; }) // TODO(skye): delete after all callers can handle 2D output .def("GetDefaultDeviceAssignment", - [](PyLocalClient* client, int num_replicas) - -> StatusOr>> { + [](std::shared_ptr client, + int num_replicas) -> StatusOr>> { TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, client->GetDefaultDeviceAssignment( num_replicas, /*num_partitions=*/1)); - std::vector> result; + std::vector> result; for (int i = 0; i < num_replicas; ++i) { int device_id = device_assignment(i, 0); auto iter = client->id_to_device().find(device_id); CHECK(iter != client->id_to_device().end()) << device_id; - result.push_back(iter->second); + result.push_back(WrapWithClient(client, iter->second)); } return result; }) @@ -913,16 +922,17 @@ PYBIND11_MODULE(xla_extension, m) { m.def("get_cpu_client", &GetCpuClient, py::arg("asynchronous") = true); m.def("get_nvidia_gpu_client", &GetNvidiaGpuClient, py::arg("asynchronous") = true, - py::arg("allocator_config") = GpuAllocatorConfig()); + py::arg("allocator_config") = GpuAllocatorConfig(), + py::arg("distributed_client") = nullptr, py::arg("node_id") = 0); - py::class_ buffer(m, "PyLocalBuffer"); + py::class_> buffer( + m, "PyLocalBuffer"); buffer .def_static( "from_python", [](const pybind11::object& argument, - std::shared_ptr client, - std::shared_ptr device, - bool force_copy) -> StatusOr> { + std::shared_ptr client, Device* device, + bool force_copy) -> StatusOr> { CHECK(device != 
nullptr); auto iter = client->id_to_device().find(device->id()); if (iter->second != device) { @@ -943,36 +953,57 @@ PYBIND11_MODULE(xla_extension, m) { GlobalPyRefManager()->ManageReference(std::move(c->array)); py::gil_scoped_release gil_release; - return PyLocalBuffer::FromHostBuffer( - c->buf_ptr, c->shape, force_copy, std::move(py_buffer_ref), - std::move(client), std::move(device)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer, + PyLocalBuffer::FromHostBuffer(c->buf_ptr, c->shape, force_copy, + std::move(py_buffer_ref), + client.get(), device)); + return WrapWithClient(std::move(client), std::move(buffer)); }, py::arg("argument"), py::arg("client"), py::arg("device"), py::arg("force_copy") = false) - .def_static("make_tuple", - [](const std::vector buffers, - std::shared_ptr client, - std::shared_ptr device) - -> StatusOr> { - CHECK(device != nullptr); - auto iter = client->id_to_device().find(device->id()); - if (iter->second != device) { - return InvalidArgument( - "Cannot make tuple on device '%s' with '%s' backend", - device->DebugString(), client->platform_name()); - } - return PyLocalBuffer::MakeTuple(buffers, std::move(client), - std::move(device)); - }) + .def_static( + "make_tuple", + [](std::vector buffers, + std::shared_ptr client, + Device* device) -> StatusOr> { + CHECK(device != nullptr); + auto iter = client->id_to_device().find(device->id()); + if (iter->second != device) { + return InvalidArgument( + "Cannot make tuple on device '%s' with '%s' backend", + device->DebugString(), client->platform_name()); + } + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer, + PyLocalBuffer::MakeTuple(buffers, client.get(), device)); + return WrapWithClient(std::move(client), std::move(buffer)); + }) .def("copy_to_device", - [](PyLocalBuffer* buffer, std::shared_ptr dst_device) { - CHECK(dst_device != nullptr); + [](PyLocalBuffer* buffer, const ClientAndPtr& dst_device) + -> StatusOr> { + CHECK(dst_device.get() != nullptr); GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; - return buffer->CopyToDevice(std::move(dst_device)); + TF_ASSIGN_OR_RETURN(std::unique_ptr out, + buffer->CopyToDevice(dst_device.get())); + return WrapWithClient(dst_device.client, std::move(out)); }) .def("delete", &PyLocalBuffer::Delete) - .def("destructure", &PyLocalBuffer::DestructureTuple) + .def("destructure", + [](const PyLocalBuffer& buffer) + -> StatusOr>> { + TF_ASSIGN_OR_RETURN( + std::vector> parts, + buffer.DestructureTuple()); + std::vector> output; + output.reserve(parts.size()); + for (auto& part : parts) { + output.push_back(WrapWithClient( + buffer.client()->shared_from_this(), std::move(part))); + } + return std::move(output); + }) .def("block_host_until_ready", [](PyLocalBuffer* buffer) { GlobalPyRefManager()->CollectGarbage(); @@ -1004,7 +1035,11 @@ PYBIND11_MODULE(xla_extension, m) { return LiteralToPython(std::move(literal)); }) .def("shape", &PyLocalBuffer::on_host_shape) - .def("device", &PyLocalBuffer::device) + .def("device", + [](const PyLocalBuffer& buffer) { + return WrapWithClient(buffer.client()->shared_from_this(), + buffer.device()); + }) .def("platform", &PyLocalBuffer::platform_name) .def("is_deleted", [](const PyLocalBuffer& buffer) { @@ -1030,24 +1065,160 @@ PYBIND11_MODULE(xla_extension, m) { PyTypeObject* buffer_type = reinterpret_cast(buffer.ptr()); buffer_type->tp_as_buffer = &PyLocalBufferProcs; - py::class_(m, "LocalExecutable") - .def_static("Compile", &PyLocalExecutable::Compile, - py::call_guard()) - .def_static("Compile", 
&PyLocalExecutable::CompileForDevices, - py::call_guard()) + py::class_> + executable(m, "LocalExecutable"); + executable + .def_static("Compile", + [](const XlaComputation& computation, + absl::optional> argument_layouts, + const ExecutableBuildOptions* build_options, + std::shared_ptr client, + absl::optional device_assignment) + -> StatusOr> { + py::gil_scoped_release gil_release; + CompileOptions options; + options.argument_layouts = std::move(argument_layouts); + if (build_options) { + options.executable_build_options = *build_options; + } + if (device_assignment) { + options.executable_build_options.set_device_assignment( + *device_assignment); + } + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + PyLocalExecutable::Compile(computation, client.get(), + std::move(options))); + return WrapWithClient(std::move(client), + std::move(executable)); + }) + .def_static("Compile", + [](const XlaComputation& computation, + absl::optional> argument_layouts, + const ExecutableBuildOptions* build_options, + std::shared_ptr client, + absl::optional>> + device_assignment) + -> StatusOr> { + py::gil_scoped_release gil_release; + CompileOptions options; + options.argument_layouts = std::move(argument_layouts); + if (build_options) { + options.executable_build_options = *build_options; + } + if (device_assignment) { + TF_ASSIGN_OR_RETURN( + DeviceAssignment xla_assignment, + DevicesToDeviceAssignment(*device_assignment)); + options.executable_build_options.set_device_assignment( + xla_assignment); + } + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + PyLocalExecutable::Compile(computation, client.get(), + std::move(options))); + return WrapWithClient(std::move(client), + std::move(executable)); + }) .def("local_logical_device_ids", &PyLocalExecutable::local_logical_device_ids) - .def("local_devices", &PyLocalExecutable::local_devices) + .def("local_devices", + [](const PyLocalExecutable& executable) { + std::vector> devices; + devices.reserve(executable.local_devices().size()); + for (Device* device : executable.local_devices()) { + devices.push_back(WrapWithClient( + executable.client()->shared_from_this(), device)); + } + return devices; + }) .def("SizeOfGeneratedCodeInBytes", &PyLocalExecutable::SizeOfGeneratedCodeInBytes) .def("Delete", &PyLocalExecutable::Delete) - .def("Execute", &PyLocalExecutable::Execute, - py::call_guard(), py::arg("arguments")) - // TODO(phawkins): remove when all callers switch to ExecuteOnLocalDevices - .def("ExecutePerReplica", &PyLocalExecutable::ExecutePerReplica, - py::call_guard(), py::arg("arguments")) - .def("ExecuteOnLocalDevices", &PyLocalExecutable::ExecuteOnLocalDevices, - py::call_guard(), py::arg("arguments")) + .def( + "Execute", + [](const PyLocalExecutable& executable, + absl::Span args) + -> StatusOr> { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN( + std::vector> output, + executable.Execute(args, ExecuteOptions())); + return WrapWithClient(executable.client()->shared_from_this(), + std::move(output.front())); + }, + py::arg("arguments")) + // TODO(phawkins): remove in favor of overload that returns a vector. 
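The bindings in the hunks above stop handing raw Device* / PyLocalBuffer* / PyLocalExecutable* pointers to Python and instead return them paired with a shared_ptr to the owning PyLocalClient via the WrapWithClient / ClientAndPtr helpers (their definitions are not shown in this excerpt), so any handle held on the Python side keeps the client alive. A minimal sketch of the idea, with hypothetical names that are not part of this change:

#include <memory>
#include <utility>

// Pairs a non-owning pointer with a shared_ptr to its owner so that whoever
// holds the pair also keeps the owner alive (illustration only; the real
// ClientAndPtr type in this change may differ in layout and API).
template <typename Owner, typename T>
struct OwnedPtrSketch {
  std::shared_ptr<Owner> owner;
  T* ptr = nullptr;
  T* get() const { return ptr; }
};

template <typename Owner, typename T>
OwnedPtrSketch<Owner, T> WrapWithOwner(std::shared_ptr<Owner> owner, T* ptr) {
  return OwnedPtrSketch<Owner, T>{std::move(owner), ptr};
}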
+ .def( + "Execute", + [](const PyLocalExecutable& executable, + absl::Span args, bool tuple_arguments) + -> StatusOr>> { + py::gil_scoped_release gil_release; + ExecuteOptions options; + options.tuple_arguments = tuple_arguments; + options.untuple_result = true; + TF_ASSIGN_OR_RETURN( + std::vector> output_buffers, + executable.Execute(args, options)); + std::vector> outputs; + outputs.reserve(output_buffers.size()); + for (auto& buffer : output_buffers) { + outputs.push_back(WrapWithClient( + executable.client()->shared_from_this(), std::move(buffer))); + } + return outputs; + }, + py::arg("arguments"), py::arg("tuple_arguments")) + // TODO(phawkins): remove in favor of overload that returns a vector. + .def( + "ExecuteOnLocalDevices", + [](const PyLocalExecutable& executable, + absl::Span> args) + -> StatusOr>> { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN( + std::vector>> + output_buffers, + executable.ExecuteOnLocalDevices(args, ExecuteOptions())); + std::vector> outputs; + outputs.reserve(output_buffers.size()); + for (auto& buffers : output_buffers) { + outputs.push_back( + WrapWithClient(executable.client()->shared_from_this(), + std::move(buffers.front()))); + } + return outputs; + }, + py::arg("arguments")) + .def( + "ExecuteOnLocalDevices", + [](const PyLocalExecutable& executable, + absl::Span> args, + bool tuple_arguments) + -> StatusOr< + std::vector>>> { + py::gil_scoped_release gil_release; + ExecuteOptions options; + options.tuple_arguments = tuple_arguments; + options.untuple_result = true; + TF_ASSIGN_OR_RETURN( + std::vector>> + output_buffers, + executable.ExecuteOnLocalDevices(args, options)); + std::vector>> outputs; + outputs.resize(output_buffers.size()); + for (int computation = 0; computation < output_buffers.size(); + ++computation) { + for (auto& buffer : output_buffers[computation]) { + outputs[computation].push_back( + WrapWithClient(executable.client()->shared_from_this(), + std::move(buffer))); + } + } + return outputs; + }, + py::arg("arguments"), py::arg("tuple_arguments")) .def( "get_hlo_modules", [](const PyLocalExecutable& executable) @@ -1208,7 +1379,14 @@ PYBIND11_MODULE(xla_extension, m) { .def("ClearSharding", &XlaBuilder::ClearSharding); m.def("BufferToDLPackManagedTensor", BufferToDLPackManagedTensor); - m.def("DLPackManagedTensorToBuffer", DLPackManagedTensorToBuffer); + m.def("DLPackManagedTensorToBuffer", + [](const py::capsule& tensor, std::shared_ptr client) + -> StatusOr> { + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer, + DLPackManagedTensorToBuffer(tensor, client.get())); + return WrapWithClient(std::move(client), std::move(buffer)); + }); py::enum_( m, "TriangularSolveOptions_Transpose") @@ -1247,6 +1425,16 @@ PYBIND11_MODULE(xla_extension, m) { BuildOpsSubmodule(&m); BuildProfilerSubmodule(&m); + + py::class_> + distributed_runtime_service(m, "DistributedRuntimeService"); + py::class_> + distributed_runtime_client(m, "DistributedRuntimeClient"); + + m.def("get_distributed_runtime_service", &GetDistributedRuntimeService); + m.def("get_distributed_runtime_client", &GetDistributedRuntimeClient); } // NOLINT(readability/fn_size) } // namespace xla diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 9d53f9bd082..f1f31a5eb89 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -177,7 +177,7 @@ def _cpu_backend_factory(): return LocalBackend(platform='cpu', client=client) -def _gpu_backend_factory(): +def 
_gpu_backend_factory(distributed_client=None, node_id=0): """Returns a GPU backend. BFC allocator is used by default.""" allocator = os.getenv('XLA_PYTHON_CLIENT_ALLOCATOR', 'default').lower() memory_fraction = os.getenv('XLA_PYTHON_CLIENT_MEM_FRACTION') @@ -197,8 +197,11 @@ def _gpu_backend_factory(): config.memory_fraction = float(memory_fraction) config.preallocate = preallocate not in ('0', 'false', 'False') - client = _xla.get_nvidia_gpu_client(asynchronous=True, - allocator_config=config) + client = _xla.get_nvidia_gpu_client( + asynchronous=True, + allocator_config=config, + distributed_client=distributed_client, + node_id=node_id) return LocalBackend(platform='gpu', client=client) @@ -604,17 +607,17 @@ class Computation(object): # def SizeOfGeneratedCodeInBytes(self) -> int: # """Return generated binary size, or -1 if not known.""" # -# def ExecutePerReplica(self, arguments: [[Buffer]]) -> [Buffer]: +# def ExecuteOnLocalDevices(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. # # Args: # arguments: A sequence of sequences of Buffers. The i'th inner sequence -# comprises the arguments for execution on the i'th replica. +# comprises the arguments for execution on the i'th local device. # # Returns: -# A list of the computation's outputs for each replica, as a Buffer. If -# a shallow sequence of arguments was passed in for `arguments`, then the -# sole, zero'th replica's output is returned instead, as a Buffer. +# A list of the computation's outputs for each local device, as a Buffer. +# If a shallow sequence of arguments was passed in for `arguments`, then +# the sole, zero'th device's output is returned instead, as a Buffer. # """ # # There are different implementations of Executable for different backends. @@ -658,7 +661,7 @@ def execute_with_python_values_replicated(executable, arguments, backend=None): for replica_args in arguments: arg_buffers.append(flat_arg_buffers[:len(replica_args)]) flat_arg_buffers = flat_arg_buffers[len(replica_args):] - return [out.to_py() for out in executable.ExecutePerReplica(arg_buffers)] + return [out.to_py() for out in executable.ExecuteOnLocalDevices(arg_buffers)] class PaddingType(enum.Enum): diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 502f0fa7927..98851fddd2d 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -3133,6 +3133,7 @@ cc_library( hdrs = ["hlo_dce.h"], deps = [ ":hlo", + ":hlo_casting_utils", ":hlo_pass", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index fd373671b97..1f36d906e73 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -3647,7 +3647,8 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* hlo) { // A reshape that collapses multiple dimensions into a dimension being // reduced can just reduce all of those dimensions instead of doing a // collapsing reshape before a reduction. 
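The algebraic_simplifier hunk just below gates the reduce-of-reshape rewrite behind a new AlgebraicSimplifierOptions knob, declared further down in algebraic_simplifier.h. A usage sketch under the usual pass-pipeline setup (the helper and pipeline names are illustrative, not from this change):

#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"

void AddSimplifierWithoutReduceOfReshape(xla::HloPassPipeline* pipeline) {
  xla::AlgebraicSimplifierOptions options;
  // Keep reshape-feeding-reduce patterns as written instead of folding the
  // collapsed dimensions into the reduction.
  options.set_enable_reduce_of_reshape(false);
  pipeline->AddPass<xla::AlgebraicSimplifier>(options);
}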
- if (arg->opcode() == HloOpcode::kReshape) { + if (options_.enable_reduce_of_reshape() && + arg->opcode() == HloOpcode::kReshape) { std::vector> unmodified_dims = ShapeUtil::DimensionsUnmodifiedByReshape(arg->operand(0)->shape(), arg->shape()); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index ce364a16134..4251e7eb846 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -107,6 +107,12 @@ class AlgebraicSimplifierOptions { return metadata_.cudnn_batchnorm_forward_training_metadata; } + void set_enable_reduce_of_reshape(bool enable_reduce_of_reshape) { + enable_reduce_of_reshape_ = enable_reduce_of_reshape; + } + + bool enable_reduce_of_reshape() const { return enable_reduce_of_reshape_; } + private: // Metadata struct can be used to store any metadata information encapsulated // with the AlgebraicSimplierOptions that can be later used in an @@ -126,6 +132,7 @@ class AlgebraicSimplifierOptions { bool enable_dot_to_multiply_rewrite_{true}; bool enable_conv_simplification_{true}; bool enable_window_reduce_to_reduce_replacement_{true}; + bool enable_reduce_of_reshape_{true}; int64 very_small_gather_size_{4}; Metadata metadata_; }; diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 6e7f9fdfc13..06b55e24b69 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -87,7 +87,7 @@ class AllocationTracker { // Internal helper which resolves the given GlobalDataHandle to a // list of ScopedShapedBuffers. StatusOr> ResolveInternal( - const GlobalDataHandle& data) const EXCLUSIVE_LOCKS_REQUIRED(mutex_); + const GlobalDataHandle& data) const TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Internal helper which registers a vector of shaped buffers, one per // replica. ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer. If @@ -96,18 +96,19 @@ class AllocationTracker { template StatusOr RegisterInternal( std::vector replicated_buffers, const string& tag) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Adds the given device address to the allocation tracker, or if it already // exists, then increment its reference count. void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory, int device_ordinal) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); + TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Decrements the reference count of the given device memory. Then, if it is // zero, deallocate the memory. Status DecrementRefCount(se::DeviceMemoryBase device_memory, - int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + int device_ordinal) + TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // A map from device memory opaque value to allocation. One such map is // maintained per device ordinal. @@ -121,11 +122,11 @@ class AllocationTracker { // The next handle to assign to an allocation, guarded by the same mutex as // the mapping as they'll be mutated at the same time. - int64 next_handle_ GUARDED_BY(mutex_); + int64 next_handle_ TF_GUARDED_BY(mutex_); // A map from device ordinal to AllocationMap. absl::flat_hash_map opaque_to_allocation_map_ - GUARDED_BY(mutex_); + TF_GUARDED_BY(mutex_); // A map from data handle to a vector of shaped buffers that represent the // buffers for different replicas. 
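Most of the header changes in this part of the patch are a mechanical migration from the bare GUARDED_BY / EXCLUSIVE_LOCKS_REQUIRED macros to the TF_-prefixed equivalents from tensorflow/core/platform/thread_annotations.h. A self-contained illustration of the annotated style (hypothetical class, not part of this change):

#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

class CounterSketch {
 public:
  void Increment() {
    tensorflow::mutex_lock lock(mu_);
    ++value_;
  }

 private:
  tensorflow::mutex mu_;
  // Clang's thread-safety analysis warns if value_ is touched without mu_.
  int value_ TF_GUARDED_BY(mu_) = 0;
};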
@@ -145,7 +146,7 @@ class AllocationTracker { // free'd when both the view *and* the original tuple are Unregistered. This // refcounting is managed in opaque_to_allocation_map_. absl::flat_hash_map>> - handle_to_shaped_buffers_ GUARDED_BY(mutex_); + handle_to_shaped_buffers_ TF_GUARDED_BY(mutex_); TF_DISALLOW_COPY_AND_ASSIGN(AllocationTracker); }; diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index 79fdeb2b0bc..2e2284a3e23 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -176,7 +176,7 @@ class Backend { // Mapping from stream executor to stream pools, used by `BorrowStream` above. absl::flat_hash_map> - stream_pools_ GUARDED_BY(mu_); + stream_pools_ TF_GUARDED_BY(mu_); // The default memory allocator to use. std::unique_ptr memory_allocator_; diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h index 89e17eba36f..a02cda91a69 100644 --- a/tensorflow/compiler/xla/service/channel_tracker.h +++ b/tensorflow/compiler/xla/service/channel_tracker.h @@ -68,24 +68,24 @@ class ChannelTracker { // Bumps the next_channel_ number and returns the allocated number // wrapped in a ChannelHandle. ChannelHandle AllocateHandle(ChannelHandle::ChannelType type) - EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); + TF_EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); Status RegisterSendInternal(const ChannelHandle& handle) - EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); + TF_EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); Status RegisterRecvInternal(const ChannelHandle& handle) - EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); + TF_EXCLUSIVE_LOCKS_REQUIRED(channel_mutex_); // Guards the channel mapping. tensorflow::mutex channel_mutex_; // The next sequence number to assign to a channel. - int64 next_channel_ GUARDED_BY(channel_mutex_); + int64 next_channel_ TF_GUARDED_BY(channel_mutex_); // Mapping from ChannelHandle value to the corresponding registered // Channel object. absl::flat_hash_map opaque_to_channel_ - GUARDED_BY(channel_mutex_); + TF_GUARDED_BY(channel_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(ChannelTracker); }; diff --git a/tensorflow/compiler/xla/service/collective_ops_utils.h b/tensorflow/compiler/xla/service/collective_ops_utils.h index d9b6c48685b..6af05d925aa 100644 --- a/tensorflow/compiler/xla/service/collective_ops_utils.h +++ b/tensorflow/compiler/xla/service/collective_ops_utils.h @@ -241,9 +241,9 @@ class Rendezvous { tensorflow::mutex mu_; - bool initialized_ GUARDED_BY(mu_) = false; + bool initialized_ TF_GUARDED_BY(mu_) = false; - std::vector participants_ GUARDED_BY(mu_); + std::vector participants_ TF_GUARDED_BY(mu_); private: // Runs the all-reduce on the given thread. 
If successful, returns diff --git a/tensorflow/compiler/xla/service/compilation_cache.h b/tensorflow/compiler/xla/service/compilation_cache.h index 5f94def509d..22c1af8fdab 100644 --- a/tensorflow/compiler/xla/service/compilation_cache.h +++ b/tensorflow/compiler/xla/service/compilation_cache.h @@ -51,7 +51,7 @@ class CompilationCache { using CacheKey = int64; absl::flat_hash_map> cache_ - GUARDED_BY(mutex_); + TF_GUARDED_BY(mutex_); private: TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache); diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index c07c3eb3c3b..6bfd8c4db46 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -975,8 +975,9 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module, can_share_buffer_)); - for (HloComputation* computation : module->computations()) { - for (HloInstruction* instruction : computation->instructions()) { + for (HloComputation* computation : module->MakeComputationPostOrder()) { + for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { if (instruction->opcode() == HloOpcode::kWhile) { TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction)); } else if (instruction->opcode() == HloOpcode::kConditional) { diff --git a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc index e624e5cc7eb..244d7d4c539 100644 --- a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc +++ b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc @@ -23,7 +23,7 @@ namespace orc_jit_memory_mapper { static tensorflow::mutex mapper_instance_mutex(tensorflow::LINKER_INITIALIZED); static llvm::SectionMemoryManager::MemoryMapper* mapper_instance - GUARDED_BY(mapper_instance_mutex) = nullptr; + TF_GUARDED_BY(mapper_instance_mutex) = nullptr; llvm::SectionMemoryManager::MemoryMapper* GetInstance() { tensorflow::mutex_lock lock(mapper_instance_mutex); diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index 2f8be8c111b..cbbc4d7bf34 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -273,7 +273,8 @@ class VectorSupportLibrary { llvm::Value* GetConstantFloat(llvm::Type* type, const llvm::APFloat& f) { llvm::Constant* scalar_value = llvm::ConstantFP::get(type->getContext(), f); if (llvm::isa(type)) { - return llvm::ConstantVector::getSplat(vector_size(), scalar_value); + return llvm::ConstantVector::getSplat( + llvm::ElementCount(vector_size(), /*Scalable=*/false), scalar_value); } return scalar_value; } diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc index 3cb0eb78c5b..ca6fadc2e23 100644 --- a/tensorflow/compiler/xla/service/dump.cc +++ b/tensorflow/compiler/xla/service/dump.cc @@ -274,7 +274,7 @@ static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); // dies. But we only add an entry if dumping is enabled for this module, and // dumping a module leaks buffer space in stdout or bytes on disk *way* faster // than this hashtable leaks memory. 
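The dump.cc hunk here keeps its per-module counters in function-local, never-destructed hash maps guarded by a linker-initialized mutex; the deliberate `*new` leak is the usual way to sidestep static destruction-order problems, and the surrounding comment explains why the leak is acceptable. The same idiom in isolation (hypothetical counter, not from this change):

#include <cstdint>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"

static tensorflow::mutex counter_mu(tensorflow::LINKER_INITIALIZED);

// Returns how many times NextCount() has previously been called for `id`.
static int64_t NextCount(int64_t id) {
  // Intentionally leaked so the map stays valid for the life of the process.
  static auto& counts TF_GUARDED_BY(counter_mu) =
      *new absl::flat_hash_map<int64_t, int64_t>();
  tensorflow::mutex_lock lock(counter_mu);
  return counts[id]++;
}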
-static auto& module_id_to_step_number GUARDED_BY(mu) = +static auto& module_id_to_step_number TF_GUARDED_BY(mu) = *new absl::flat_hash_map(); // Maps a module's unique ID to a timestamp indicating when we've first dumped @@ -285,7 +285,7 @@ static auto& module_id_to_step_number GUARDED_BY(mu) = // dies. But we only add an entry if dumping is enabled for this module, and // dumping a module leaks buffer space in stdout or bytes on disk *way* faster // than this hashtable leaks memory. -static auto& module_id_to_timestamp GUARDED_BY(mu) = +static auto& module_id_to_timestamp TF_GUARDED_BY(mu) = *new absl::flat_hash_map(); int64 StepNumberForModule(const HloModule& module) { @@ -432,7 +432,7 @@ void DumpHloSnapshotIfEnabled(const HloModule& module, int64 execution_count; uint64 timestamp; { - static auto& module_id_to_execution_count GUARDED_BY(mu) = + static auto& module_id_to_execution_count TF_GUARDED_BY(mu) = *new absl::flat_hash_map(); tensorflow::mutex_lock lock(mu); execution_count = module_id_to_execution_count[module.unique_id()]++; @@ -469,7 +469,7 @@ void DumpHloSnapshotIfEnabled(const HloSnapshot& snapshot, // have to use its name. int64 execution_count; { - static auto& module_name_to_execution_count GUARDED_BY(mu) = + static auto& module_name_to_execution_count TF_GUARDED_BY(mu) = *new absl::flat_hash_map(); tensorflow::mutex_lock lock(mu); execution_count = module_name_to_execution_count[name]++; diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc index dfcf50a3108..34d144ea1e9 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc @@ -584,10 +584,13 @@ Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) { HloInstruction* operand_dynamic_size, DimensionConstraint constraint) -> Status { HloInstruction* reshape = hlo; - TF_RET_CHECK(reshape->shape().rank() > 0) - << "Reshaping a dynamic dimension into a scalar, which has " - "undefined behavior. The offending instruction is: " - << reshape->ToString(); + if (reshape->shape().rank() == 0) { + VLOG(0) << "Reshaping a dynamic dimension into a scalar, which has " + "undefined behavior when input size is 0. The offending " + "instruction is: " + << reshape->ToString(); + return Status::OK(); + } auto common_factors = CommonFactors(operand->shape().dimensions(), reshape->shape().dimensions()); int64 input_dim_start = -1; diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc index 3a69a684b86..d2913f9d2a1 100644 --- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc +++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc @@ -560,6 +560,30 @@ TEST_F(DynamicDimensionInferenceTest, ReshapeTestMajorDimension) { EXPECT_NE(inference_->GetDynamicSize(reshape, {}, 0), nullptr); } +TEST_F(DynamicDimensionInferenceTest, ReshapeIntoScalar) { + // Test the ability to a reshape into scalar. 
+ auto builder = HloComputation::Builder(TestName()); + auto input_shape = ShapeUtil::MakeShape(F32, {1}); + auto output_shape = ShapeUtil::MakeShape(F32, {}); + + auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, input_shape, "A")); + + builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, scalar_shape_, "size_param")); + + builder.AddInstruction(HloInstruction::CreateReshape(output_shape, a_param)); + + module_->AddEntryComputation(builder.Build()); + + TF_CHECK_OK(module_->dynamic_parameter_binding().Bind( + DynamicParameterBinding::DynamicParameter{1, {}}, + DynamicParameterBinding::DynamicDimension{0, {}, 0})); + + SCOPED_TRACE(module_->ToString()); + TF_CHECK_OK(RunInference()); +} + TEST_F(DynamicDimensionInferenceTest, GatherTest) { const string hlo_text = R"( HloModule TensorFlowGatherV2 diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h index 4e9b9f883e2..8819b9da922 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.h +++ b/tensorflow/compiler/xla/service/execution_tracker.h @@ -86,12 +86,12 @@ class ExecutionTracker { private: // The next handle to assign to an execution. - int64 next_handle_ GUARDED_BY(execution_mutex_); + int64 next_handle_ TF_GUARDED_BY(execution_mutex_); // Mapping from ExecutionHandle handle to the corresponding registered // AsyncExecution object. std::map> handle_to_execution_ - GUARDED_BY(execution_mutex_); + TF_GUARDED_BY(execution_mutex_); tensorflow::mutex execution_mutex_; // Guards the execution mapping. diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index ba5d7e9d788..4a903548c22 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -156,6 +156,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_reachability", + "//tensorflow/core/platform:random", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h index 8ef5a46b3e3..50ecca51588 100644 --- a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CHOLESKY_THUNK_H_ -#include "absl/base/thread_annotations.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" @@ -29,6 +28,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/stream_executor/blas.h" namespace xla { @@ -66,7 +66,8 @@ class CholeskyThunk : public Thunk { const int64 n_; tensorflow::mutex mu_; - absl::flat_hash_map contexts_ GUARDED_BY(mu_); + absl::flat_hash_map contexts_ + TF_GUARDED_BY(mu_); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index 2a071cd658d..a5001d5168d 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -127,12 +127,12 @@ class Rendezvous { std::make_shared(key_.num_participants)}; tensorflow::mutex mu_; - bool initialized_ GUARDED_BY(mu_) = false; + bool initialized_ TF_GUARDED_BY(mu_) = false; // We use an std::map so that we can iterate over it below in a guaranteed // order. The order shouldn't actually matter, but why be nondeterministic if // we don't have to be? - std::map participants_ GUARDED_BY(mu_); + std::map participants_ TF_GUARDED_BY(mu_); }; void EnqueueCopy(se::DeviceMemoryBase src, se::Stream* src_stream, diff --git a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc index de67b115ff7..8316cb7d12d 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc @@ -45,11 +45,11 @@ using GemmCacheKey = std::tuple; static tensorflow::mutex autotune_cache_mu(tensorflow::LINKER_INITIALIZED); -static auto& autotune_cache GUARDED_BY(autotune_cache_mu) = +static auto& autotune_cache TF_GUARDED_BY(autotune_cache_mu) = *new absl::flat_hash_map>(); -static int64 cache_hits GUARDED_BY(autotune_cache_mu) = 0; -static int64 cache_misses GUARDED_BY(autotune_cache_mu) = 0; +static int64 cache_hits TF_GUARDED_BY(autotune_cache_mu) = 0; +static int64 cache_misses TF_GUARDED_BY(autotune_cache_mu) = 0; // Experimentally tries to pick the best algorithm for the given gemm. // @@ -58,15 +58,48 @@ static int64 cache_misses GUARDED_BY(autotune_cache_mu) = 0; // than sm_50 -- in both cases, cublas doesn't support gemm-with-algorithm at // all. 
static StatusOr> DoUncachedGemmAutotune( - const HloInstruction* gemm, se::DeviceMemoryBase lhs_buffer, - se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, - se::DeviceMemoryBase reference_result_buffer, se::Stream* stream, - const se::RedzoneAllocator& allocator, const BufferComparator& comparator, - bool crash_on_checking_failure) { + const HloInstruction* gemm, se::Stream* stream, + se::DeviceMemoryAllocator* allocator) { if (!stream->parent()->SynchronizeAllActivity()) { return InternalError("Failed to synchronize GPU for autotuning."); } + const HloModuleConfig& hlo_module_config = gemm->GetModule()->config(); + const bool init_cublas_data = + hlo_module_config.debug_options().xla_gpu_autotune_level() > 1; + se::RedzoneAllocator input_output_allocator( + stream, allocator, PtxOptsFromConfig(hlo_module_config), + /*memory_limit=*/std::numeric_limits::max()); + + BufferComparator comparator(gemm->shape(), hlo_module_config); + + int64 rng_state = 0; + auto get_initialized_buffer = + [&](const HloInstruction* op) -> StatusOr { + TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer, + input_output_allocator.AllocateBytes( + ShapeUtil::ByteSizeOf(op->shape()))); + if (init_cublas_data) { + InitializeBuffer(stream, op->shape().element_type(), &rng_state, buffer); + } + return buffer; + }; + + TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase lhs_buffer, + get_initialized_buffer(gemm->operand(0))); + TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase rhs_buffer, + get_initialized_buffer(gemm->operand(1))); + TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase output_buffer, + get_initialized_buffer(gemm)); + TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase reference_result_buffer, + get_initialized_buffer(gemm)); + + const DebugOptions& debug_options = + gemm->GetModule()->config().debug_options(); + + const bool crash_on_checking_failure = + debug_options.xla_gpu_crash_on_verification_failures(); + GemmBackendConfig backend_config = gemm->backend_config().ValueOrDie(); const int32 cublas_autotune_level = @@ -124,7 +157,7 @@ static StatusOr> DoUncachedGemmAutotune( TF_ASSIGN_OR_RETURN( se::RedzoneAllocator::RedzoneCheckStatus rz_check_status, - allocator.CheckRedzones()); + input_output_allocator.CheckRedzones()); if (!rz_check_status.ok()) { result.mutable_failure()->set_kind(AutotuneResult::REDZONE_MODIFIED); *result.mutable_failure()->mutable_msg() = @@ -194,17 +227,14 @@ static StatusOr> DoUncachedGemmAutotune( } static StatusOr> DoGemmAutotune( - const HloInstruction* instr, const HloInstruction* lhs, - const HloInstruction* rhs, se::DeviceMemoryBase lhs_buffer, - se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, - se::DeviceMemoryBase reference_result_buffer, se::Stream* stream, - bool crash_on_checking_failure, const se::RedzoneAllocator& allocator, - const BufferComparator& comparator) { + const HloInstruction* instr, const GemmBackendConfig& gemm_config, + se::DeviceMemoryAllocator* allocator, se::Stream* stream) { + const HloInstruction* lhs = instr->operand(0); + const HloInstruction* rhs = instr->operand(1); + // Don't run autotuning concurrently on the same GPU. 
tensorflow::mutex_lock gpu_lock = LockGpu(stream->parent()); - GemmBackendConfig gemm_config = - instr->backend_config().ValueOrDie(); GemmCacheKey key = std::make_tuple(stream->parent(), lhs->shape(), rhs->shape(), @@ -235,11 +265,8 @@ static StatusOr> DoGemmAutotune( VLOG(2) << "Batch size is non-singular, using generic algorithm"; result = absl::nullopt; } else { - TF_ASSIGN_OR_RETURN( - result, - DoUncachedGemmAutotune(instr, lhs_buffer, rhs_buffer, output_buffer, - reference_result_buffer, stream, allocator, - comparator, crash_on_checking_failure)); + TF_ASSIGN_OR_RETURN(result, + DoUncachedGemmAutotune(instr, stream, allocator)); } CHECK(autotune_cache.emplace(key, result).second); @@ -255,52 +282,11 @@ static StatusOr RunOnInstruction(HloInstruction* instr, TF_ASSIGN_OR_RETURN(se::Stream* const stream, allocator->GetStream(executor->device_ordinal())); - const HloModuleConfig& hlo_module_config = instr->GetModule()->config(); - const bool init_cublas_data = - hlo_module_config.debug_options().xla_gpu_autotune_level() > 1; - se::RedzoneAllocator input_output_allocator( - stream, allocator, PtxOptsFromConfig(hlo_module_config), - /*memory_limit=*/std::numeric_limits::max()); - - BufferComparator comparator(instr->shape(), hlo_module_config); - - int64 rng_state = 0; - auto get_initialized_buffer = - [&](const HloInstruction* op) -> StatusOr { - TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase buffer, - input_output_allocator.AllocateBytes( - ShapeUtil::ByteSizeOf(op->shape()))); - if (init_cublas_data) { - InitializeBuffer(stream, op->shape().element_type(), &rng_state, buffer); - } - return buffer; - }; - GemmBackendConfig gemm_config = instr->backend_config().ValueOrDie(); - const HloInstruction* lhs = instr->operand(0); - const HloInstruction* rhs = instr->operand(1); - TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase lhs_buffer, - get_initialized_buffer(lhs)); - TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase rhs_buffer, - get_initialized_buffer(rhs)); - TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase output_buffer, - get_initialized_buffer(instr)); - TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase reference_result_buffer, - get_initialized_buffer(instr)); - - const DebugOptions& debug_options = - instr->GetModule()->config().debug_options(); - - const bool crash_on_checking_failure = - debug_options.xla_gpu_crash_on_verification_failures(); - - TF_ASSIGN_OR_RETURN( - absl::optional gemm_algorithm, - DoGemmAutotune(instr, lhs, rhs, lhs_buffer, rhs_buffer, output_buffer, - reference_result_buffer, stream, crash_on_checking_failure, - input_output_allocator, comparator)); + TF_ASSIGN_OR_RETURN(absl::optional gemm_algorithm, + DoGemmAutotune(instr, gemm_config, allocator, stream)); // We update instruction->backend_config(); if no algorithms are supported, // a different API is used, which does not require specifying an algorithm. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 51b30e238e9..3a8c3321e24 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -343,11 +343,14 @@ Status GpuCompiler::PrepareHloModuleForIrEmitting(HloModule* hlo_module) { // TODO(cheshire): Duplication with gpu_conv_algorithm picker, figure out a // right way to share this. 
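The RequireDeterminism() change just below memoizes the TF_DETERMINISTIC_OPS lookup in a function-local static initialized by an immediately invoked lambda, so the environment variable is read once per process instead of on every call. The idiom in isolation (the flag name is a placeholder, not from this change):

#include <cstdlib>

static bool ExampleFlagEnabled() {
  static const bool enabled = [] {
    const char* value = std::getenv("EXAMPLE_FLAG");
    return value != nullptr && value[0] == '1';
  }();
  return enabled;
}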
static bool RequireDeterminism() { - bool deterministic_ops = false; - TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS", - /*default_val=*/false, - &deterministic_ops)); - return deterministic_ops; + static bool require_determinism = [] { + bool deterministic_ops = false; + TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS", + /*default_val=*/false, + &deterministic_ops)); + return deterministic_ops; + }(); + return require_determinism; } Status GpuCompiler::OptimizeHloPostLayoutAssignment( diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc index 31ace1a416e..7a7d2e1e1b1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc @@ -268,9 +268,9 @@ ConvCacheKey AutotuneCacheKeyfromInstruction( } tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED); -auto& autotune_cache GUARDED_BY(autotune_cache_lock) = +auto& autotune_cache TF_GUARDED_BY(autotune_cache_lock) = *new absl::flat_hash_map(); -auto& autotune_cache_stats GUARDED_BY(autotune_cache_lock) = +auto& autotune_cache_stats TF_GUARDED_BY(autotune_cache_lock) = *new ConvCacheStats(); } // anonymous namespace diff --git a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h index 41825a33174..c1b83158dfb 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h @@ -126,17 +126,17 @@ class GpuDebugInfoManager { }; tensorflow::mutex mutex_; - bool tracing_active_ GUARDED_BY(mutex_) = false; + bool tracing_active_ TF_GUARDED_BY(mutex_) = false; // Modules that was running currently. Because multiple instances of the // modules can be running in the same time, a reference count is maintained // as map value. absl::flat_hash_map running_module_ids_ - GUARDED_BY(mutex_); + TF_GUARDED_BY(mutex_); // Active modules are those still tracked by us. There could be much more // active modules than running modules, we will try to reduce the trace size // by only transfer those modules that were running during tracing period. absl::flat_hash_map active_modules_ - GUARDED_BY(mutex_); + TF_GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 1f601712038..50d27182df1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -161,6 +161,9 @@ Status GpuExecutable::ExecuteThunks( sub_streams.emplace_back(); TF_ASSIGN_OR_RETURN(sub_streams.back(), run_options->BorrowStream(executor->device_ordinal())); + // Require substreams to wait for the main stream, otherwise substreams may + // execute before the program is scheduled to start on the main stream. + sub_streams.back()->ThenWaitFor(main_stream); } HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 3d3afe6168b..33642a7dc3d 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -159,9 +159,9 @@ class GpuExecutable : public Executable { // `ResolveConstantGlobals`. 
tensorflow::mutex module_handle_mutex_; std::map - module_handles_ GUARDED_BY(module_handle_mutex_); + module_handles_ TF_GUARDED_BY(module_handle_mutex_); std::map - module_globals_ GUARDED_BY(module_handle_mutex_); + module_globals_ TF_GUARDED_BY(module_handle_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 50ef3905495..88351881f3a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -84,7 +84,7 @@ class KernelThunk : public Thunk { // Loaded kernels for each `StreamExecutor`. Requires pointer stability of // values. std::unordered_map> - kernel_cache_ GUARDED_BY(mutex_); + kernel_cache_ TF_GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc index 52c4fb93199..8d568e7f5d4 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc @@ -492,7 +492,7 @@ void RendezvousNcclAllReduce::CleanupImpl(std::shared_ptr handle, // lives, which is how we avoid expensive reinitialization of NCCL cliques. struct NcclAllReduceThunk::AuxData { tensorflow::mutex mu; - absl::flat_hash_set> cliques GUARDED_BY(mu); + absl::flat_hash_set> cliques TF_GUARDED_BY(mu); }; /*static*/ bool NcclAllReduceThunk::CanImplement(const HloInstruction* crs) { diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 6d036094a69..4f46e292210 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -148,18 +148,23 @@ Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization( Status NVPTXCompiler::OptimizeHloPostLayoutAssignment( HloModule* hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) { + HloPassPipeline pre_pipeline("nvptx post-layout_assignment part 1"); + // Pad the dimensions of matrices in dot operations to multiples of 8. + // This needs to run before GemmRewriter, which is part of + // OptimizeHloPostLayoutAssignment(). + if (IsVoltaOrLater(*stream_exec)) { + pre_pipeline.AddPass(); + } + TF_RETURN_IF_ERROR(pre_pipeline.Run(hlo_module).status()); + TF_RETURN_IF_ERROR(GpuCompiler::OptimizeHloPostLayoutAssignment( hlo_module, stream_exec, device_allocator)); - HloPassPipeline pipeline("nvptx post-layout_assignment"); - // Pad the dimensions of matrices in dot operations to multiples of 8. - if (IsVoltaOrLater(*stream_exec)) { - pipeline.AddPass(); - } + HloPassPipeline post_pipeline("nvptx post-layout_assignment part 2"); // Find the fastest algorithm for GEMMs. - pipeline.AddPass(stream_exec, device_allocator); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + post_pipeline.AddPass(stream_exec, device_allocator); + TF_RETURN_IF_ERROR(post_pipeline.Run(hlo_module).status()); return Status::OK(); } @@ -228,13 +233,14 @@ bool MaybeLoadPtxFromFile(const HloModule* module, std::string* ptx) { // and warn when a file is not used to ease catching typo in filename. 
std::string prefix = xla::FilenameFor(*module, "", *ptx); std::string matched_filename; - for (const string filename : + for (const string full_filename : module->config().debug_options().xla_gpu_ptx_file()) { // To ease comparing many PTX versions, accept different suffixes then // the original filename. + auto filename = tensorflow::io::Basename(full_filename); if (absl::StartsWith(filename, prefix)) { - matched_filename = filename; - VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; + matched_filename = full_filename; + VLOG(0) << "RunBackend() - Will load PTX from file: " << full_filename; break; } } diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index 3098d5af25f..e69be947522 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -62,8 +62,8 @@ class NVPTXCompiler : public GpuCompiler { // We cache the cuda_data_dir() and the result of our search, so that if the // next module we have to compile has the same cuda_data_dir(), we can skip // the search. - string cached_cuda_data_dir_ GUARDED_BY(mutex_); - string cached_libdevice_dir_ GUARDED_BY(mutex_); + string cached_cuda_data_dir_ TF_GUARDED_BY(mutex_); + string cached_libdevice_dir_ TF_GUARDED_BY(mutex_); // Tries to compile the given ptx string to cubin. Returns a vector with the // compiled cubin. If compilation was unsuccessful, returns an empty vector. @@ -116,7 +116,7 @@ class NVPTXCompiler : public GpuCompiler { // is critical here. absl::node_hash_map - compilation_cache_ GUARDED_BY(mutex_); + compilation_cache_ TF_GUARDED_BY(mutex_); TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler); }; diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc index a92a5783b67..d9a5463013d 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" +#include "tensorflow/core/platform/random.h" namespace xla { namespace gpu { @@ -72,13 +73,17 @@ int ComputeStreamToAssign( return kInvalidStreamNum; } - if (hlo.GetModule() - ->config() - .debug_options() - .xla_gpu_disable_multi_streaming()) { + const auto& debug_options = hlo.GetModule()->config().debug_options(); + if (debug_options.xla_gpu_disable_multi_streaming()) { return 0; } + if (debug_options.xla_gpu_use_random_streams()) { + // Debug feature: make random stream assignments to try to uncover + // concurrency bugs. + return tensorflow::random::New64() % 100; + } + if (!(IsCublasGemm(hlo) || IsMatrixMultiplication(hlo))) { // If `hlo` is not implemented as a GEMM, keep it close to its operands to // avoid excessive synchronization. diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.h b/tensorflow/compiler/xla/service/gpu/stream_assignment.h index 52d38b6f20e..1bcbec06921 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment.h +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.h @@ -30,7 +30,7 @@ class StreamAssignment { int StreamNumberForHlo(const HloInstruction& hlo) const; bool HasStreamAssigned(const HloInstruction& hlo) const; // `hlo` needs to outlive this StreamAssignment object. 
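The stream_assignment.cc hunk above adds a debugging knob, xla_gpu_use_random_streams, that assigns HLOs to pseudo-random streams to help surface concurrency bugs. A sketch of enabling it through a module's debug options, assuming the proto-generated setter name (the helper function is illustrative):

#include "tensorflow/compiler/xla/service/hlo_module_config.h"

void EnableRandomStreams(xla::HloModuleConfig* config) {
  xla::DebugOptions debug_options = config->debug_options();
  debug_options.set_xla_gpu_use_random_streams(true);  // assumed proto setter
  config->set_debug_options(debug_options);
}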
- void AssignStreamToHlo(const HloInstruction* hlo, int stream_no); + void AssignStreamToHlo(const HloInstruction* hlo, int stream_num); private: int stream_count_ = 1; // At least the main stream. diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index 23b27a5f67d..a573b621c88 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -21,8 +21,10 @@ limitations under the License. #include #include "absl/container/flat_hash_set.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/status.h" @@ -35,7 +37,8 @@ limitations under the License. namespace xla { -StatusOr HloDCE::RunOnComputation(HloComputation* computation) { +StatusOr HloDCE::RunOnComputation( + HloComputation* computation, bool remove_cross_partition_collective_ops) { bool changed = false; VLOG(3) << "Before dce:"; XLA_VLOG_LINES(3, computation->ToString()); @@ -47,7 +50,12 @@ StatusOr HloDCE::RunOnComputation(HloComputation* computation) { if (instruction != computation->root_instruction() && instruction->user_count() == 0 && computation->IsSafelyRemovable(instruction) && - !instruction->HasSideEffect()) { + (!instruction->HasSideEffect() || + (remove_cross_partition_collective_ops && + ((instruction->opcode() == HloOpcode::kAllReduce && + !Cast(instruction)->constrain_layout()) || + instruction->opcode() == HloOpcode::kCollectivePermute || + instruction->opcode() == HloOpcode::kAllToAll)))) { dead_roots.push_back(instruction); } } @@ -74,8 +82,9 @@ StatusOr HloDCE::Run(HloModule* module) { // Run DCE on each computation. for (auto* computation : module->MakeComputationPostOrder()) { - TF_ASSIGN_OR_RETURN(bool changed_for_computation, - RunOnComputation(computation)); + TF_ASSIGN_OR_RETURN( + bool changed_for_computation, + RunOnComputation(computation, remove_cross_partition_collective_ops_)); changed |= changed_for_computation; } diff --git a/tensorflow/compiler/xla/service/hlo_dce.h b/tensorflow/compiler/xla/service/hlo_dce.h index f22f98868ab..49bb2e3f139 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.h +++ b/tensorflow/compiler/xla/service/hlo_dce.h @@ -35,15 +35,23 @@ namespace xla { // instructions cannot be deleted. class HloDCE : public HloModulePass { public: + HloDCE() : remove_cross_partition_collective_ops_(false) {} + explicit HloDCE(bool remove_cross_partition_collective_ops) + : remove_cross_partition_collective_ops_( + remove_cross_partition_collective_ops) {} ~HloDCE() override {} absl::string_view name() const override { return "dce"; } // Run DCE on a computation. - static StatusOr RunOnComputation(HloComputation* computation); + StatusOr RunOnComputation(HloComputation* computation, + bool remove_cross_partition_collective_ops); // Run the pass on the given module. Returns whether the module was changed // (instructions were removed). 
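The hlo_dce.h changes above give HloDCE an opt-in constructor argument for deleting unused cross-partition collectives (layout-unconstrained all-reduce, collective-permute, all-to-all), which plain DCE keeps because they are side-effecting. A usage sketch in a pass pipeline (the helper name is illustrative):

#include "tensorflow/compiler/xla/service/hlo_dce.h"
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"

void AddPostPartitioningDce(xla::HloPassPipeline* pipeline) {
  // Also remove dead collective ops that an ordinary HloDCE() would retain.
  pipeline->AddPass<xla::HloDCE>(
      /*remove_cross_partition_collective_ops=*/true);
}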
StatusOr Run(HloModule* module) override; + + private: + bool remove_cross_partition_collective_ops_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 2e205606977..78e4d39d3fe 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1557,7 +1557,7 @@ string WrapDotInHtml(absl::string_view dot) { tensorflow::mutex url_renderer_mu(tensorflow::LINKER_INITIALIZED); std::function(absl::string_view)>* url_renderer - GUARDED_BY(url_renderer_mu) = nullptr; + TF_GUARDED_BY(url_renderer_mu) = nullptr; // Precondition: url_renderer != nullptr. // @@ -1567,7 +1567,7 @@ std::function(absl::string_view)>* url_renderer // of producing dot for the graph.) StatusOr WrapDotInFormat(absl::string_view dot, RenderedGraphFormat format) - EXCLUSIVE_LOCKS_REQUIRED(url_renderer_mu) { + TF_EXCLUSIVE_LOCKS_REQUIRED(url_renderer_mu) { switch (format) { case RenderedGraphFormat::kUrl: CHECK(url_renderer != nullptr) diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 70cbdad9ca7..c8a68db25d4 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -225,7 +225,7 @@ string HloModule::ToString(const HloPrintOptions& options) const { } s << "\n\n"; const auto& computations = options.canonicalize_computations() - ? MakeComputationSortedByContent() + ? MakeComputationSorted() : MakeComputationPostOrder(); for (const HloComputation* computation : computations) { if (!options.print_computation(computation)) { @@ -602,16 +602,23 @@ std::vector HloModule::MakeComputationPostOrder() const { return post_order; } -std::vector HloModule::MakeComputationSortedByContent() const { - auto result = MakeComputationPostOrder(); - std::sort(result.begin(), result.end(), - [](HloComputation* a, HloComputation* b) { - if (a->instruction_count() != b->instruction_count()) { - return a->instruction_count() < b->instruction_count(); - } - return a->ToString(HloPrintOptions::Fingerprint()) < - b->ToString(HloPrintOptions::Fingerprint()); - }); +namespace { +bool CompareComputationsByContent(HloComputation* a, HloComputation* b) { + if (a->instruction_count() != b->instruction_count()) { + return a->instruction_count() < b->instruction_count(); + } + return a->ToString(HloPrintOptions::Fingerprint()) < + b->ToString(HloPrintOptions::Fingerprint()); +} +} // anonymous namespace + +std::vector HloModule::MakeComputationSorted() const { + std::vector result; + result.reserve(computations_.size()); + for (const auto& computation : computations_) { + result.push_back(computation.get()); + } + std::sort(result.begin(), result.end(), CompareComputationsByContent); return result; } @@ -629,10 +636,7 @@ std::vector HloModule::MakeNonfusionComputations() const { std::vector HloModule::MakeNonfusionComputationsSorted() const { auto result = MakeNonfusionComputations(); - std::sort(result.begin(), result.end(), - [](HloComputation* a, HloComputation* b) { - return a->name() < b->name(); - }); + std::sort(result.begin(), result.end(), CompareComputationsByContent); return result; } @@ -717,10 +721,10 @@ HloComputation* HloModule::GetComputationWithName(absl::string_view name) { uint64 HloModule::Hash() const { uint64 result = entry_computation_layout().Hash(); - // Use MakeComputationSortedByContent() instead of MakeComputationPostOrder() + // Use MakeComputationSorted() instead 
of MakeComputationPostOrder() // because naming may affect the order of MakeComputationPostOrder() but not - // MakeComputationSortedByContent(). - for (auto* computation : MakeComputationSortedByContent()) { + // MakeComputationSorted(). + for (auto* computation : MakeComputationSorted()) { for (auto* instruction : computation->MakeInstructionPostOrder()) { result = tensorflow::Hash64Combine(result, instruction->Hash()); } diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index e44e22ba954..38395f173e1 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -197,8 +197,8 @@ class HloModule { std::vector MakeComputationPostOrder() const; // Same as MakeComputationPostOrder() but sorting the computations by their - // contents. - std::vector MakeComputationSortedByContent() const; + // contents. The order is longer post order. + std::vector MakeComputationSorted() const; // Gets the computations in this module which aren't for fusion nodes. // @@ -211,7 +211,7 @@ class HloModule { // MakeNonfusionComputations(). std::vector MakeNonfusionComputations() const; - // Same as MakeNonfusionComputations() but sorting the computations by names. + // Same as MakeNonfusionComputations() but sorting computations by content. std::vector MakeNonfusionComputationsSorted() const; const HloModuleConfig& config() const { return config_; } diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index dee601d9e96..d90a1485441 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -183,6 +183,12 @@ class HloModuleConfig { return &fusion_config_; } + const std::vector>& dot_config() const { + return dot_config_; + } + + std::vector>* mutable_dot_config() { return &dot_config_; } + private: // If you add new members, be sure to update compilation_cache_key. @@ -213,7 +219,14 @@ class HloModuleConfig { FusionConfigCollection fusion_config_collection_ = FusionConfigCollection::kOff; + // Custom fusion configuration, where fusion_config_[c][v] control if node v + // in computation c must be fused to all its consumers (true) or not (false). std::vector> fusion_config_; + + // Custom dot canonicalization configuration, where dot_config_[v] control + // how to convert dot operation v (sorted topologically and by computation) to + // convolution. + std::vector> dot_config_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 21be4216469..bfc6769660a 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1648,6 +1648,8 @@ StatusOr HloRematerialization::RematerializeComputation( } else { // Found a valid block. Reset to start looking for single instructions // again. + max_rematerialized_block_size_ = + std::max(max_rematerialized_block_size_, max_block_size); changed = true; min_block_size = 1; max_block_size = 1; diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index d1c4b8b5e7b..72221fa8a32 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -180,6 +180,10 @@ class HloRematerialization : public HloModulePass { // dead. 
Hence, no net instructions were added. int64 net_instructions_added_ = 0; + // Size of the largest block that has been rematerialized. This is actually an + // upper bound (within a factor of 2) on the block size. + int max_rematerialized_block_size_ = 0; + RematerializationMode mode_; }; diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 552c8eb1ae5..7a4eefc1ab6 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -71,11 +71,36 @@ cc_library( ), ) +cc_library( + name = "executable_base", + srcs = ["executable_base.cc"], + hdrs = ["executable_base.h"], + deps = [ + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_tree", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_proto_cc", + "//tensorflow/compiler/xla/service:dynamic_dimension_inference", + "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_execution_profile", + "//tensorflow/compiler/xla/service:maybe_owning_device_memory", + "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/compiler/xla/service:transfer_manager", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:stream", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/types:optional", + ], +) + cc_library( name = "executable", srcs = ["executable.cc"], hdrs = ["executable.h"], deps = [ + ":executable_base", ":executor", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 725cb437f8c..cc7fdeaf0f6 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/interpreter/executable_base.h" #include "tensorflow/compiler/xla/service/interpreter/executor.h" #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" @@ -41,8 +42,7 @@ InterpreterExecutable::InterpreterExecutable( std::unique_ptr hlo_module, std::unique_ptr evaluator, absl::optional dynamic_dymension_inference) - : Executable(std::move(hlo_module), /*hlo_profile_printer_data=*/nullptr, - /*hlo_profile_index_map=*/nullptr), + : InterpreterExecutableBase(std::move(hlo_module)), evaluator_(std::move(evaluator)), dynamic_dimension_inference_(std::move(dynamic_dymension_inference)) { if (dynamic_dimension_inference_.has_value()) { @@ -51,107 +51,12 @@ InterpreterExecutable::InterpreterExecutable( } } -InterpreterExecutable::~InterpreterExecutable() {} - -StatusOr InterpreterExecutable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); - se::StreamExecutor* executor = stream->parent(); - const se::Platform* platform = executor->platform(); - - // Convert the ShapeTree to a ShapedBuffer. We do this so we can call - // TransferManager methods below. 
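The body being removed above is not lost: the patch moves the stream setup, argument transfer, and profiling boilerplate into the new InterpreterExecutableBase added later in this change, and the derived executable now only supplies the evaluation step through a virtual Evaluate(). A minimal sketch of that template-method split, using simplified stand-in types rather than the real XLA classes (Literal here is just a vector of doubles):

#include <iostream>
#include <vector>

// Stand-in for xla::Literal; the real class carries shape and layout.
using Literal = std::vector<double>;

// The base class owns the argument marshalling and profiling; subclasses only
// override Evaluate(), mirroring how InterpreterExecutableBase::ExecuteAsyncOnStream
// now delegates to InterpreterExecutable::Evaluate().
class ExecutableBase {
 public:
  virtual ~ExecutableBase() = default;

  Literal ExecuteOnStream(const std::vector<Literal>& arguments) {
    // 1) Common pre-work (in XLA: device-to-host transfers, shape checks).
    std::cout << "transferring " << arguments.size() << " arguments\n";
    // 2) Backend-specific evaluation supplied by the subclass.
    Literal result = Evaluate(arguments);
    // 3) Common post-work (in XLA: host transfer of the result, profiling).
    std::cout << "result has " << result.size() << " elements\n";
    return result;
  }

 protected:
  virtual Literal Evaluate(const std::vector<Literal>& arguments) = 0;
};

class InterpreterLikeExecutable : public ExecutableBase {
 protected:
  Literal Evaluate(const std::vector<Literal>& arguments) override {
    // Toy evaluation: element-wise sum of all arguments.
    Literal sum(arguments.empty() ? 0 : arguments[0].size(), 0.0);
    for (const Literal& arg : arguments) {
      for (size_t i = 0; i < sum.size(); ++i) sum[i] += arg[i];
    }
    return sum;
  }
};

int main() {
  InterpreterLikeExecutable exec;
  Literal out = exec.ExecuteOnStream({{1, 2, 3}, {4, 5, 6}});
  for (double v : out) std::cout << v << ' ';
  std::cout << '\n';
}

The real base class additionally manages ShapedBuffer ownership and the execution profile, which the sketch omits.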
- std::vector argument_buffers; - argument_buffers.reserve(arguments.size()); - for (auto& argument : arguments) { - const ShapeTree& buffers = argument.Buffers(); - argument_buffers.push_back(ShapedBuffer(buffers.shape(), buffers.shape(), - /*platform=*/nullptr, - /*device_ordinal=*/0)); - auto in_it = buffers.begin(); - auto out_it = argument_buffers.back().buffers().begin(); - for (; in_it != buffers.end(); ++in_it, ++out_it) { - out_it->second = in_it->second.AsDeviceMemoryBase(); - } - } - - VLOG(1) << "Execute " << module().name(); - if (VLOG_IS_ON(2)) { - for (const auto& a : argument_buffers) { - VLOG(2) << "-- argument " << a; - } - } - - uint64 start_micros = tensorflow::Env::Default()->NowMicros(); - - const HloComputation* computation = module().entry_computation(); - if (computation->num_parameters() != arguments.size()) { - return tensorflow::errors::Internal( - "Mismatch between argument count and graph parameter count."); - } - - // Check that the args have the right shape. - for (int64 i = 0; i < computation->num_parameters(); ++i) { - const auto& expected_shape = computation->parameter_instruction(i)->shape(); - const auto& actual_shape = argument_buffers[i].on_device_shape(); - if (!Shape::Equal().MinorToMajorOnlyInLayout()(expected_shape, - actual_shape)) { - return InvalidArgument( - "Shape mismatch on parameter %d. Expected %s, but was %s.", i, - ShapeUtil::HumanStringWithLayout(expected_shape), - ShapeUtil::HumanStringWithLayout(actual_shape)); - } - } - - TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager, - TransferManager::GetForPlatform(platform)); - - // Transform the ShapedBuffer arguments into literals which the evaluator - // consumes. - std::vector arg_literals; - for (int64 p = 0; p < computation->num_parameters(); ++p) { - TF_ASSIGN_OR_RETURN(Literal arg_literal, - transfer_manager->TransferLiteralFromDevice( - run_options->stream(), argument_buffers[p])); - arg_literals.push_back(std::move(arg_literal)); - } - +StatusOr InterpreterExecutable::Evaluate( + const HloComputation& computation, absl::Span arg_literals) { // Execute the graph using the HloEvaluator. - Literal result_literal; - { - tensorflow::mutex_lock lock(evaluator_lock_); - evaluator_->ResetVisitStates(); - TF_ASSIGN_OR_RETURN(result_literal, - evaluator_->Evaluate(*computation, arg_literals)); - } - - // Transform the result literal back into a ShapedBuffer. 
- TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result_buffers, - transfer_manager->AllocateScopedShapedBuffer( - result_literal.shape(), run_options->allocator(), - executor->device_ordinal())); - TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( - run_options->stream(), result_literal, result_buffers)); - ExecutionOutput result(std::move(result_buffers)); - - uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - - ExecutionProfile* profile = run_options->run_options().execution_profile(); - if (profile) { - const double nanoseconds = (end_micros - start_micros) * 1000.0; - profile->set_compute_time_ns(std::max(nanoseconds, 1.0)); - } - for (auto& argument : arguments) { - for (auto& index_buffer : *argument.MutableBuffers()) { - auto maybe_owning_buffer = index_buffer.second.Release(); - if (maybe_owning_buffer) { - result.AddToBeReleased(std::move(*maybe_owning_buffer)); - } - } - } - return std::move(result); + tensorflow::mutex_lock lock(evaluator_lock_); + evaluator_->ResetVisitStates(); + return evaluator_->Evaluate(computation, arg_literals); } /*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) { diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index 5b2f41a884c..ce68a8472f5 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/interpreter/executable_base.h" #include "tensorflow/compiler/xla/service/service_executable_run_options.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" @@ -40,25 +41,22 @@ namespace interpreter { // Responsible for running a HLO graph through the HloEvaluator and output // buffer allocation. Refer to interpreter/README.md for more. -class InterpreterExecutable : public Executable { +class InterpreterExecutable : public InterpreterExecutableBase { public: InterpreterExecutable( std::unique_ptr hlo_module, std::unique_ptr evaluator, absl::optional dynamic_dymension_inference); - ~InterpreterExecutable() override; - - StatusOr ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override - LOCKS_EXCLUDED(evaluator_lock_); static int64 ShapeSizeBytes(const Shape& shape); protected: + StatusOr Evaluate(const HloComputation& computation, + absl::Span arg_literals) override + TF_LOCKS_EXCLUDED(evaluator_lock_); + // The interpreter interprets executables with an HloEvaluator. - std::unique_ptr evaluator_ PT_GUARDED_BY(evaluator_lock_); + std::unique_ptr evaluator_ TF_PT_GUARDED_BY(evaluator_lock_); mutable tensorflow::mutex evaluator_lock_; private: diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.cc b/tensorflow/compiler/xla/service/interpreter/executable_base.cc new file mode 100644 index 00000000000..5850cbf005b --- /dev/null +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.cc @@ -0,0 +1,137 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/interpreter/executable_base.h" + +#include +#include + +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" +#include "tensorflow/compiler/xla/service/shaped_buffer.h" +#include "tensorflow/compiler/xla/service/transfer_manager.h" +#include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor_pimpl.h" + +namespace xla { +namespace interpreter { + +InterpreterExecutableBase::InterpreterExecutableBase( + std::unique_ptr hlo_module) + : Executable(std::move(hlo_module), /*hlo_profile_printer_data=*/nullptr, + /*hlo_profile_index_map=*/nullptr) {} + +StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + std::vector arguments, + HloExecutionProfile* hlo_execution_profile) { + se::Stream* stream = run_options->stream(); + se::StreamExecutor* executor = stream->parent(); + const se::Platform* platform = executor->platform(); + + // Convert the ShapeTree to a ShapedBuffer. We do this so we can call + // TransferManager methods below. + std::vector argument_buffers; + argument_buffers.reserve(arguments.size()); + for (auto& argument : arguments) { + const ShapeTree& buffers = argument.Buffers(); + argument_buffers.push_back(ShapedBuffer(buffers.shape(), buffers.shape(), + /*platform=*/nullptr, + /*device_ordinal=*/0)); + auto in_it = buffers.begin(); + auto out_it = argument_buffers.back().buffers().begin(); + for (; in_it != buffers.end(); ++in_it, ++out_it) { + out_it->second = in_it->second.AsDeviceMemoryBase(); + } + } + + VLOG(1) << "Execute " << module().name(); + if (VLOG_IS_ON(2)) { + for (const auto& a : argument_buffers) { + VLOG(2) << "-- argument " << a; + } + } + + uint64 start_micros = tensorflow::Env::Default()->NowMicros(); + + const HloComputation* computation = module().entry_computation(); + if (computation->num_parameters() != arguments.size()) { + return tensorflow::errors::Internal( + "Mismatch between argument count and graph parameter count."); + } + + // Check that the args have the right shape. + for (int64 i = 0; i < computation->num_parameters(); ++i) { + const auto& expected_shape = computation->parameter_instruction(i)->shape(); + const auto& actual_shape = argument_buffers[i].on_device_shape(); + if (!Shape::Equal().MinorToMajorOnlyInLayout()(expected_shape, + actual_shape)) { + return InvalidArgument( + "Shape mismatch on parameter %d. 
Expected %s, but was %s.", i, + ShapeUtil::HumanStringWithLayout(expected_shape), + ShapeUtil::HumanStringWithLayout(actual_shape)); + } + } + + TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager, + TransferManager::GetForPlatform(platform)); + + // Transform the ShapedBuffer arguments into literals which the evaluator + // consumes. + std::vector arg_literals; + for (int64 p = 0; p < computation->num_parameters(); ++p) { + TF_ASSIGN_OR_RETURN(Literal arg_literal, + transfer_manager->TransferLiteralFromDevice( + run_options->stream(), argument_buffers[p])); + arg_literals.push_back(std::move(arg_literal)); + } + + TF_ASSIGN_OR_RETURN(Literal result_literal, + Evaluate(*computation, arg_literals)); + + // Transform the result literal back into a ShapedBuffer. + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result_buffers, + transfer_manager->AllocateScopedShapedBuffer( + result_literal.shape(), run_options->allocator(), + executor->device_ordinal())); + TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( + run_options->stream(), result_literal, result_buffers)); + ExecutionOutput result(std::move(result_buffers)); + + uint64 end_micros = tensorflow::Env::Default()->NowMicros(); + + ExecutionProfile* profile = run_options->run_options().execution_profile(); + if (profile) { + const double nanoseconds = (end_micros - start_micros) * 1000.0; + profile->set_compute_time_ns(std::max(nanoseconds, 1.0)); + } + for (auto& argument : arguments) { + for (auto& index_buffer : *argument.MutableBuffers()) { + auto maybe_owning_buffer = index_buffer.second.Release(); + if (maybe_owning_buffer) { + result.AddToBeReleased(std::move(*maybe_owning_buffer)); + } + } + } + return std::move(result); +} + +} // namespace interpreter +} // namespace xla diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.h b/tensorflow/compiler/xla/service/interpreter/executable_base.h new file mode 100644 index 00000000000..a02ab7af8d0 --- /dev/null +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTABLE_BASE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTABLE_BASE_H_ + +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h" +#include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/service_executable_run_options.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla.pb.h" +namespace xla { +namespace interpreter { + +// Responsible for running a HLO graph through the HloEvaluator and output +// buffer allocation. Refer to interpreter/README.md for more. +class InterpreterExecutableBase : public Executable { + public: + explicit InterpreterExecutableBase(std::unique_ptr hlo_module); + + StatusOr ExecuteAsyncOnStream( + const ServiceExecutableRunOptions* run_options, + std::vector arguments, + HloExecutionProfile* hlo_execution_profile) override; + + protected: + virtual StatusOr Evaluate( + const HloComputation& computation, + absl::Span arg_literals) = 0; + + private: + TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutableBase); +}; + +} // namespace interpreter +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTABLE_BASE_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 2279be7d2e5..3c35fda55f1 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -130,19 +130,19 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { std::function callback) override; port::Status AllocateEvent(Event *event) override { - return port::Status{port::error::UNIMPLEMENTED, ""}; + return port::Status::OK(); } port::Status DeallocateEvent(Event *event) override { - return port::Status{port::error::UNIMPLEMENTED, ""}; + return port::Status::OK(); } port::Status RecordEvent(Stream *stream, Event *event) override { - return port::Status{port::error::UNIMPLEMENTED, ""}; + return port::Status{port::error::UNIMPLEMENTED, "RecordEvent"}; } port::Status WaitForEvent(Stream *stream, Event *event) override { - return port::Status{port::error::UNIMPLEMENTED, ""}; + return port::Status{port::error::UNIMPLEMENTED, "WaitForEvent"}; } Event::Status PollForEventStatus(Event *event) override { diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 2ebb6ae567e..d30c24616ff 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -424,10 +424,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { aliased_allocation->chunk(), definition_time, definition_time)); } + std::vector use_times(uses.size()); + for (int i = 0; i < uses.size(); ++i) { + use_times[i] = instruction_schedule.at(uses[i].instruction); + } // Iterate over the uses. 
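For context on the use_times loop just above: the schedule time of every use is now recorded up front (in schedule order), so later allocation requests can consult any use time rather than only the last one. A standalone sketch of that precomputation, with a plain map standing in for the real instruction schedule and string keys standing in for HloInstruction pointers:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in: maps an instruction name to its logical time in the
// flattened module schedule.
using InstructionSchedule = std::map<std::string, int64_t>;

std::vector<int64_t> CollectUseTimes(const InstructionSchedule& schedule,
                                     const std::vector<std::string>& uses) {
  std::vector<int64_t> use_times(uses.size());
  for (size_t i = 0; i < uses.size(); ++i) {
    use_times[i] = schedule.at(uses[i]);
  }
  // Uses are enumerated in schedule order, so use_times comes out sorted and
  // use_times.back() is the time of the last use.
  return use_times;
}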
for (HloUse use : uses) { int64 use_time = instruction_schedule.at(use.instruction); - int64 last_use_time = instruction_schedule.at(uses.back().instruction); int64 latest_prefetch_time = use_time; if (use.instruction->parent() != defining_computation) { @@ -457,7 +460,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { AllocationRequest request; request.start_time = definition_time; request.end_time = use_time; - request.last_use_time = last_use_time; + request.use_times = &use_times; request.latest_prefetch_time = latest_prefetch_time; request.use = use; request.buffer = value; @@ -692,7 +695,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( VLOG(2) << "Finding allocation for " << request.buffer->ToShortString() << " (" << request.start_time << ", " << request.end_time << ") latest prefetch = " << request.latest_prefetch_time - << " last use = " << request.last_use_time + << " last use = " << request.use_times->back() << " use = " << request.use.ToString() << ". Size = " << request.size << ", def pos = " << defining_position.ToString(); CHECK_LE(request.start_time, request.end_time); @@ -880,8 +883,8 @@ bool AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( // the last use time, we try to find an allocation that is available for the // entire Producer to Use2 range. absl::optional chunk_candidate = - FindBestNoCopyChunkCandidate(request.end_time, request.last_use_time, - preferred_offset, &alternate_mem_interval); + FindBestChunkCandidate(request.end_time, *request.use_times, + preferred_offset, &alternate_mem_interval); // Check if the new heap size fits within limits. Also ensure if a // preferred offset was provided, that offset was used. if (chunk_candidate) { @@ -1027,39 +1030,39 @@ bool AlternateMemoryBestFitHeap::Prefetch( BufferInterval alternate_mem_interval; alternate_mem_interval.buffer = request.buffer; alternate_mem_interval.size = request.size; - alternate_mem_interval.end = request.end_time; while (!options_.prefetch_interval_picker->Done()) { alternate_mem_interval.start = options_.prefetch_interval_picker->Next(); VLOG(4) << "Trying alternate memory allocation (" - << alternate_mem_interval.start << ", " - << alternate_mem_interval.end << ")"; + << alternate_mem_interval.start << ", " << request.end_time << ")"; // If this additional asynchronous copy would violate the limit, try a // different interval. if (ViolatesMaximumOutstandingAsyncCopies(alternate_mem_interval.start, - alternate_mem_interval.end)) { + request.end_time)) { VLOG(4) << "This would violate the outstanding async copy limit."; continue; } if (ViolatesAsyncCopyOrdering(alternate_mem_interval.start, - alternate_mem_interval.end)) { + request.end_time)) { VLOG(4) << "This would violate asynchronous copy ordering."; continue; } - ChunkCandidate chunk_candidate = FindChunkCandidate(alternate_mem_interval); - // Check if the new heap size fits within limits. - if (chunk_candidate.heap_size <= available_heap_size()) { + auto chunk_candidate = FindBestChunkCandidate( + request.end_time, *request.use_times, + /*preferred_offset=*/absl::nullopt, &alternate_mem_interval); + // Check if we could find a suitable chunk. + if (chunk_candidate) { VLOG(3) << "Move the buffer to alternate memory at " << alternate_mem_interval.start - << ". Offset = " << chunk_candidate.chunk.offset - << ", size = " << chunk_candidate.chunk.size - << ", heap_size = " << chunk_candidate.heap_size + << ". 
Offset = " << chunk_candidate->chunk.offset + << ", size = " << chunk_candidate->chunk.size + << ", heap_size = " << chunk_candidate->heap_size << ", prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); - AddToPendingChunks(alternate_mem_interval, chunk_candidate); + AddToPendingChunks(alternate_mem_interval, *chunk_candidate); AddAsyncCopy(prev_allocation_in_default_mem, MemorySpace::kAlternate, - chunk_candidate.chunk, alternate_mem_interval.start, + chunk_candidate->chunk, alternate_mem_interval.start, request.end_time, request.latest_prefetch_time, request.allocations); @@ -1071,14 +1074,16 @@ bool AlternateMemoryBestFitHeap::Prefetch( } absl::optional -AlternateMemoryBestFitHeap::FindBestNoCopyChunkCandidate( - int64 end_time, int64 last_use_time, absl::optional preferred_offset, +AlternateMemoryBestFitHeap::FindBestChunkCandidate( + int64 end_time, const std::vector& use_times, + absl::optional preferred_offset, BufferInterval* alternate_mem_interval) const { if (!preferred_offset) { - // Find a chunk that's as long living as possible. - for (alternate_mem_interval->end = last_use_time; - alternate_mem_interval->end >= end_time; - --alternate_mem_interval->end) { + // Find a chunk that's as long living as possible iterating in reverse over + // the use times. + for (auto use_time = use_times.rbegin(); + use_time != use_times.rend() && *use_time >= end_time; ++use_time) { + alternate_mem_interval->end = *use_time; ChunkCandidate chunk_candidate = FindChunkCandidate(*alternate_mem_interval); if (chunk_candidate.heap_size <= available_heap_size()) { @@ -1086,6 +1091,7 @@ AlternateMemoryBestFitHeap::FindBestNoCopyChunkCandidate( return chunk_candidate; } } + alternate_mem_interval->end = end_time; return absl::nullopt; } // If a preferred offset is given, try to find an allocation at that offset diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index b056204b15f..51ff5329482 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -23,9 +23,10 @@ namespace xla { // This class contains pre-set assignments determined by memory space // assignment. It contains two data structures: (1) a chunks vector that maps a -// defining HloPosition to a Chunk (offset and size), and (2) a sizes vector -// that maps the memory space to its size. If there is only one alternate memory -// space like there is currently, there will be one entry in sizes. +// defining HloPosition to a Chunk (offset and size), and (2) an assignment_info +// vector that maps the memory space to information like its allocated size and +// heap memory trace. If there is only one alternate memory space like there is +// currently, there will be one entry in assignment_info. class PresetAssignments { public: // Contains per-memory-space information like the allocated size and heap @@ -639,13 +640,13 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // Segment Segment Segment // // start_time and end_time are the start and end logical times of the segment. - // last_use_time is the time of the last use for this buffer (Use3 in the - // figure). latest_prefetch_time is the latest time we can schedule the - // CopyDone for a prefetch. + // use_times is a sorted sequence of the times of all uses. + // latest_prefetch_time is the latest time we can schedule the CopyDone for a + // prefetch. 
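To illustrate how a sorted use_times sequence is consumed by FindBestChunkCandidate above: the search walks the use times from the back so the longest-living interval is tried first, shrinking the candidate end time until a chunk fits. A simplified sketch, with a hypothetical fits() predicate standing in for the real FindChunkCandidate and heap-size check:

#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

// Walks candidate end times from the latest use down toward end_time and
// returns the first (i.e. longest-living) end time accepted by `fits`.
std::optional<int64_t> LongestFittingEnd(
    int64_t end_time, const std::vector<int64_t>& use_times,
    const std::function<bool(int64_t)>& fits) {
  for (auto it = use_times.rbegin(); it != use_times.rend() && *it >= end_time;
       ++it) {
    if (fits(*it)) return *it;
  }
  return std::nullopt;
}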
struct AllocationRequest { int64 start_time; int64 end_time; - int64 last_use_time; + const std::vector* use_times; int64 latest_prefetch_time; int64 size; HloUse use; @@ -696,11 +697,11 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const AllocationRequest& request, const MemorySpaceAssignment::Allocation& prev_allocation_in_default_mem); - // For a no-copy allocation, find the best possible chunk candidate, where it - // has the longest possible availability if no preferred offset is given, or - // at the preferred_offset if it is given. - absl::optional FindBestNoCopyChunkCandidate( - int64 end_time, int64 last_use_time, + // Find the best possible chunk candidate, where it has the longest possible + // availability if no preferred offset is given, or at the preferred_offset if + // it is given. + absl::optional FindBestChunkCandidate( + int64 end_time, const std::vector& use_times, absl::optional preferred_offset, BufferInterval* alternate_mem_interval) const; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index afceefdeae6..abeeb866e8c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -67,13 +67,13 @@ cc_library( ":lhlo_dialect_emitter", "@com_google_absl//absl/container:flat_hash_map", "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TargetNVVMIR", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:compiler", @@ -148,6 +148,7 @@ cc_library( "//tensorflow/compiler/mlir/xla:hlo", "//tensorflow/compiler/mlir/xla:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/mlir/xla:lhlo_copy_removal", "//tensorflow/compiler/mlir/xla:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_gpu", @@ -188,6 +189,7 @@ cc_library( ":failover_compiler", ":inject_errors_pass", ":mlir_compiler", + "//tensorflow/compiler/mlir/xla:hlo_utils", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/tests:codegen_test_base", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc index 184d8d202c3..0914e5ef820 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/hlo_dialect_emitter.cc @@ -82,6 +82,8 @@ StatusOr InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kSign: return {func_builder.create(loc, rets, args, attrs)}; + case HloOpcode::kSqrt: + return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kSubtract: return {func_builder.create(loc, rets, args, attrs)}; case HloOpcode::kTanh: diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index ca26ae4e756..151d82fd2a1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ 
b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -259,8 +259,8 @@ void EnableIRPrinting(mlir::PassManager* passManager) { auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { return VLOG_IS_ON(1); }; - passManager->enableIRPrinting(/*shouldPrintBeforePass=*/{}, - /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + passManager->enableIRPrinting(/*shouldPrintBeforePass=*/enable_if_vlog_is_on, + /*shouldPrintAfterPass=*/{}, /*printModuleScope=*/false, /*printAfterOnlyOnChange=*/true, llvm::dbgs()); passManager->disableMultithreading(); @@ -277,7 +277,7 @@ Status LowerLHLOToGPU(mlir::ModuleOp module) { // Next, we can strip the outer fusion operation. pm.addPass(absl::make_unique()); // Remove unnecessary Lhlo copies. - pm.addPass(::mlir::xla_hlo::createLhloCopyRemovalPass()); + pm.addPass(::mlir::xla_lhlo::createLhloCopyRemovalPass()); // Transform lhlo operations to LinAlg. pm.addPass(::mlir::xla_lhlo::createLegalizeLhloToLinalgPass()); // Fuse linalg operations. This will yield a single tiled loop nest where diff --git a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc index 75c7c284881..1f681bfab00 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/lhlo_dialect_emitter.cc @@ -113,6 +113,9 @@ Status InsertMlirOp(HloOpcode opcode, OpBuilder func_builder, Location loc, case HloOpcode::kSign: func_builder.create(loc, rets, args, attrs); break; + case HloOpcode::kSqrt: + func_builder.create(loc, rets, args, attrs); + break; case HloOpcode::kSubtract: func_builder.create(loc, rets, args, attrs); break; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD index aeaaf0b16c4..e2523d82b91 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -39,6 +39,7 @@ tf_cc_test( "compare.hlo", "const.hlo", "copy.hlo", + "copy_transpose.hlo", "cos.hlo", "exp.hlo", "fused_reduce.hlo", @@ -50,6 +51,7 @@ tf_cc_test( "rsqrt.hlo", "select.hlo", "sign.hlo", + "sqrt.hlo", "tanh.hlo", ], tags = tf_cuda_tests_tags() + ["no_rocm"], diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo index ec7df87af64..208ca2799b2 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo @@ -10,9 +10,9 @@ ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { // CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[ARG2]] // CHECK: } // CHECK: func @add_kernel(%[[ARG0]]: [[TYPE]], %[[ARG1]]: [[TYPE]], %[[ARG2]]: [[TYPE]] -// CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] -// CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] -// CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] +// CHECK-DAG: subview %[[ARG0]]{{\[}}[[INDEX:.*]]] +// CHECK-DAG: subview %[[ARG1]]{{\[}}[[INDEX]]] +// CHECK-DAG: subview %[[ARG2]]{{\[}}[[INDEX]]] // CHECK: %[[VAL1:.*]] = load %{{.*\[}}[[INDEX:.*]]] // CHECK: %[[VAL2:.*]] = load %{{.*\[}}[[INDEX]]] // CHECK: %[[RES:.*]] = addf %[[VAL1]], %[[VAL2]] diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo index e9000956c23..fe871c1feb6 100644 --- 
a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo @@ -9,10 +9,10 @@ ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { } // CHECK: func @fusion_kernel(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]]) -// CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] -// CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] -// CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] -// CHECK-DAG: std.subview %[[RESULT]]{{\[}}[[INDEX]]] +// CHECK-DAG: subview %[[ARG0]]{{\[}}[[INDEX:.*]]] +// CHECK-DAG: subview %[[ARG1]]{{\[}}[[INDEX]]] +// CHECK-DAG: subview %[[ARG2]]{{\[}}[[INDEX]]] +// CHECK-DAG: subview %[[RESULT]]{{\[}}[[INDEX]]] // CHECK: %[[V0:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] // CHECK: %[[V1:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] // CHECK: %[[ADD:.*]] = addf %[[V0]], %[[V1]] diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo new file mode 100644 index 00000000000..2ad8c1b49e3 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy_transpose.hlo @@ -0,0 +1,12 @@ +HloModule CopyTranspose + +ENTRY %CopyTranspose (x: f32[2,4]) -> f32[2,4]{0,1} { + %x = f32[2,4] parameter(0) + ROOT %copy = f32[2,4]{0,1} copy(f32[2,4] %x) +} + +// CHECK: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 * 2 + d1)> +// CHECK: func @copy(%[[OPERAND:.*]]: memref<2x4xf32>, +// CHECK-SAME: %[[RESULT:.*]]: memref<2x4xf32, #[[MAP0]]>) +// CHECK: "xla_lhlo.copy"(%[[OPERAND]], %[[RESULT]]) +// CHECK-SAME: : (memref<2x4xf32>, memref<2x4xf32, #[[MAP0]]>) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc index 7afb7e9281d..206d46debdf 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc @@ -58,6 +58,13 @@ TEST_F(LhloGenTest, Copy) { "copy.hlo")); } +TEST_F(LhloGenTest, CopyTranspose) { + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "copy_transpose.hlo")); +} + TEST_F(LhloGenTest, Select) { CompileAndVerifyIr( /*hlo_text_filename=*/tensorflow::io::JoinPath( @@ -186,6 +193,13 @@ TEST_F(LhloGenTest, Sign) { "rsqrt.hlo")); } +TEST_F(LhloGenTest, Sqrt) { + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "sqrt.hlo")); +} + TEST_F(LhloGenTest, Tanh) { CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo new file mode 100644 index 00000000000..95461b912a3 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/sqrt.hlo @@ -0,0 +1,11 @@ +HloModule Sqrt + +ENTRY %Sqrt (x: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + ROOT %sqrt = f32[2,2]{1,0} sqrt(f32[2,2]{1,0} %x) +} + +// CHECK: func @sqrt(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.sqrt"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } + diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index db977aaa32b..febbf9294b0 
100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -1187,15 +1187,19 @@ class HloInstructionIsImpl { bool Match(const ::xla::HloInstruction* inst, MatchOption option) const { if (inst != inst_) { - EXPLAIN << "HloInstruction " << inst << " is not " << inst_ << " (" - << InstToString(inst_) << ")"; + EXPLAIN << "HloInstruction " << std::hex << std::nouppercase + << std::showbase << reinterpret_cast(inst) << " is not " + << reinterpret_cast(inst_) << " (" << InstToString(inst_) + << ")"; return false; } return true; } void DescribeTo(std::ostream* os, int64 indent = 0) const { - *os << "which is " << inst_ << " (" << InstToString(inst_) << ")"; + *os << "which is " << std::hex << std::nouppercase << std::showbase + << reinterpret_cast(inst_) << " (" << InstToString(inst_) + << ")"; } private: diff --git a/tensorflow/compiler/xla/service/stream_pool.h b/tensorflow/compiler/xla/service/stream_pool.h index 7221d323a61..9cc5b7c9cea 100644 --- a/tensorflow/compiler/xla/service/stream_pool.h +++ b/tensorflow/compiler/xla/service/stream_pool.h @@ -56,7 +56,7 @@ class StreamPool { void ReturnStream(se::Stream* stream); tensorflow::mutex mu_; - std::vector> streams_ GUARDED_BY(mu_); + std::vector> streams_ TF_GUARDED_BY(mu_); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/transpose_folding.h b/tensorflow/compiler/xla/service/transpose_folding.h index f95f982eb89..ac5e1b80651 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.h +++ b/tensorflow/compiler/xla/service/transpose_folding.h @@ -39,6 +39,13 @@ class TransposeFolding : public HloModulePass { const OperandIndices&) { return {}; } + + // Helper function to always fold transposes. + static OperandIndices AlwaysFoldTranspose(const HloInstruction&, + const OperandIndices& ids) { + return ids; + } + // transposable_gemm_operands returns the set of operands it wants to fold if // the instruction argument is implemented as a GEMM kernel that supports // transposing its arguments. @@ -47,8 +54,10 @@ class TransposeFolding : public HloModulePass { // the instruction argument is implemented as a convolution that supports // transposing its arguments. explicit TransposeFolding( - TransposableGemmOperandsFn transposable_gemm_operands, - TransposableConvOperandsFn transposable_conv_operands); + TransposableGemmOperandsFn transposable_gemm_operands = + AlwaysFoldTranspose, + TransposableConvOperandsFn transposable_conv_operands = + AlwaysFoldTranspose); absl::string_view name() const override { return "transpose-folding"; } StatusOr Run(HloModule* module) override; diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc index de243431e2c..d1d5dc17083 100644 --- a/tensorflow/compiler/xla/shape.cc +++ b/tensorflow/compiler/xla/shape.cc @@ -48,7 +48,7 @@ Shape::Shape(const ShapeProto& shape_proto) { } tuple_shapes_.reserve(shape_proto.tuple_shapes_size()); for (const ShapeProto& element_shape : shape_proto.tuple_shapes()) { - *add_tuple_shapes() = Shape(element_shape); + tuple_shapes_.emplace_back(element_shape); } if (shape_proto.has_layout()) { *mutable_layout() = Layout::CreateFromProto(shape_proto.layout()); diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 23010d6ce70..e42af57e19b 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -939,6 +939,8 @@ xla_test( tags = [ "no_rocm", "optonly", + # TODO(b/151340488): Timed out on 2020-03-12. 
+ "nozapfhahn", ], deps = [ ":client_library_test_base", @@ -1096,6 +1098,10 @@ xla_test( name = "convolution_test", timeout = "long", srcs = ["convolution_test.cc"], + backend_tags = { + # TODO(b/151340488): Timed out on 2020-03-12. + "interpreter": ["nozapfhahn"], + }, shard_count = 40, tags = [ "no_rocm", @@ -1134,7 +1140,11 @@ xla_test( backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]}, backends = ["gpu"], shard_count = 25, - tags = ["no_rocm"], + tags = [ + "no_rocm", + # TODO(b/151340488): Timed out on 2020-03-12. + "nozapfhahn", + ], deps = CONVOLUTION_TEST_DEPS + [ "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1489,6 +1499,10 @@ xla_test( name = "select_and_scatter_test", timeout = "long", srcs = ["select_and_scatter_test.cc"], + backend_tags = { + # TODO(b/151340488): Timed out on 2020-03-12. + "interpreter": ["nozapfhahn"], + }, tags = [ "no_rocm", "optonly", diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index c35f05ebf45..53c0d84854e 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "llvm/ADT/Triple.h" +#include "llvm/Support/Host.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index 8908a855847..ea457024618 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -66,12 +66,12 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator { mutable tensorflow::mutex count_mutex_; // Global counts of allocations and deallocations. - int64 allocation_count_ GUARDED_BY(count_mutex_) = 0; - int64 deallocation_count_ GUARDED_BY(count_mutex_) = 0; + int64 allocation_count_ TF_GUARDED_BY(count_mutex_) = 0; + int64 deallocation_count_ TF_GUARDED_BY(count_mutex_) = 0; // Per-device counts of allocations and deallocations. - std::map device_allocation_count_ GUARDED_BY(count_mutex_); - std::map device_deallocation_count_ GUARDED_BY(count_mutex_); + std::map device_allocation_count_ TF_GUARDED_BY(count_mutex_); + std::map device_deallocation_count_ TF_GUARDED_BY(count_mutex_); }; // A base class for tests which exercise the LocalClient interface. diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index d83ba25c345..3468c12d8c9 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -69,6 +69,10 @@ message DebugOptions { // Disable multi-streaming in the GPU backend. bool xla_gpu_disable_multi_streaming = 63; + // Debugging feature: if enabled, the GPU backend will assign HLO operators to + // randomly chosen streams. This is intended to trigger concurrency bugs. + bool xla_gpu_use_random_streams = 134; + // If true, in LLVM-based backends, emit !alias.scope metadata in // generated IR. bool xla_llvm_enable_alias_scope_metadata = 70; @@ -260,7 +264,8 @@ message DebugOptions { // Guarantee run-to-run determinism from reductions on XLA:GPU. 
bool xla_gpu_deterministic_reductions = 130; - // Next id: 134 + + // Next id: 135 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index b885a7593f5..47b7cda2760 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -10,11 +10,12 @@ import "tensorflow/compiler/xla/xla_data.proto"; message DeviceAssignment { message ComputationDevice { message DeviceMeshCoordinates { - // The mesh coordinates for the device. Usually (X, Y, Core), in the order - // in which they are returned in the TopologyProto. + // The mesh coordinates for the device. Usually (X, Y, Z, Core), in the + // order in which they are returned in the TopologyProto. // X = value(0) // Y = value(1) - // Core = value(2) + // Z = value(2) + // Core = value(3) repeated int32 value = 1; } // As many replicas as there are in the replicated computation. diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.h b/tensorflow/compiler/xrt/xrt_compilation_cache.h index 02cb25ea35c..3c2577e620b 100644 --- a/tensorflow/compiler/xrt/xrt_compilation_cache.h +++ b/tensorflow/compiler/xrt/xrt_compilation_cache.h @@ -173,11 +173,11 @@ class XRTCompilationCache : public ResourceBase { // last reference to entry is released, entry is removed from cache_. void DiscardEntryRef(CompiledSubgraph* entry); void DiscardEntryRefLocked(CompiledSubgraph* entry) - EXCLUSIVE_LOCKS_REQUIRED(mu_); + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Marks the oldest unmarked entry for eviction. Requires that there is at // least one such entry. - void MarkOldestEntryForEviction() EXCLUSIVE_LOCKS_REQUIRED(mu_); + void MarkOldestEntryForEviction() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Updates datastructures to indicate that entry, which had been marked for // eviction, has been looked up. This is called by CompileIfKeyAbsent when an @@ -195,7 +195,7 @@ class XRTCompilationCache : public ResourceBase { // is never marked for eviction, so an entry larger than the max cache entries // will remain in the cache until it is replaced by something else. void LookupEntryMarkedForEviction(CompiledSubgraph* entry) - EXCLUSIVE_LOCKS_REQUIRED(mu_); + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // Creates a new entry by running initialize_program and places it in the // cache to be looked up by key. The new entry is in the 'marked for eviction' @@ -206,7 +206,7 @@ class XRTCompilationCache : public ResourceBase { CompiledSubgraph* InitializeEntry( const string& key, const std::function*)>& - initialize_program) EXCLUSIVE_LOCKS_REQUIRED(mu_); + initialize_program) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); // The maximum number of entries that are stored in the cache before entries // are marked for eviction. @@ -214,23 +214,24 @@ class XRTCompilationCache : public ResourceBase { mutable absl::Mutex mu_; // The total number of entries that are stored and not marked for eviction. - int cache_entries_ GUARDED_BY(mu_) = 0; + int cache_entries_ TF_GUARDED_BY(mu_) = 0; // The total number of entries that are marked for eviction. - int marked_for_eviction_entries_ GUARDED_BY(mu_) = 0; + int marked_for_eviction_entries_ TF_GUARDED_BY(mu_) = 0; // The value to assign to the last_use field of the next entry that is looked // up. - int64 use_counter_ GUARDED_BY(mu_) = 0; + int64 use_counter_ TF_GUARDED_BY(mu_) = 0; // All the executables that can be looked up in the cache index by key. 
An // entry is marked for eviction iff it is present in cache_ and not in // entries_by_last_use_. - std::unordered_map cache_ GUARDED_BY(mu_); + std::unordered_map cache_ TF_GUARDED_BY(mu_); // All the executable entries that can be looked up in the cache indexed by // uid. - std::unordered_map entries_by_uid_ GUARDED_BY(mu_); + std::unordered_map entries_by_uid_ + TF_GUARDED_BY(mu_); // Map from last_use to entry, used to mark entries for eviction in LRU // order. If an entry's last_use counter is not present as a key in // entries_by_last_use_ then the entry has been marked for eviction. - std::map entries_by_last_use_ GUARDED_BY(mu_); + std::map entries_by_last_use_ TF_GUARDED_BY(mu_); }; // Looks up or create an XRTCompilationCache object within the given resource diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b02eb89ebfc..188988d92c4 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -175,6 +175,9 @@ exports_files([ package_group(name = "experimental_access") +# Authorized users go here. +package_group(name = "friends") + # ----------------------------------------------------------------------------- # Public targets @@ -469,6 +472,7 @@ tf_cuda_library( "//tensorflow/core/framework:graph_to_functiondef.h", "//tensorflow/core/framework:kernel_def_builder.h", "//tensorflow/core/framework:kernel_def_util.h", + "//tensorflow/core/framework:kernel_shape_util.h", "//tensorflow/core/framework:log_memory.h", "//tensorflow/core/framework:logging.h", "//tensorflow/core/framework:lookup_interface.h", @@ -1020,7 +1024,6 @@ cc_library( "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:ctc_ops", - "//tensorflow/core/kernels:cudnn_rnn_kernels", "//tensorflow/core/kernels:data_flow", "//tensorflow/core/kernels:decode_proto_op", "//tensorflow/core/kernels:encode_proto_op", @@ -1095,6 +1098,7 @@ cc_library( "//tensorflow/core/kernels:mkl_tfconv_op", "//tensorflow/core/kernels:mkl_tmp_bf16_ops", ]) + if_cuda([ + "//tensorflow/core/kernels:cudnn_rnn_kernels", "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels", "//tensorflow/core/grappler/optimizers:gpu_swapping_ops", ]) + if_nccl([ @@ -2252,7 +2256,6 @@ filegroup( "//tensorflow/core/framework:shared_ptr_variant.h", "//tensorflow/core/framework:tensor_reference.h", "//tensorflow/core/framework:tracking_allocator.h", # only needed for tests - "//tensorflow/core/framework:unique_tensor_references.h", "//tensorflow/core/framework:variant.h", "//tensorflow/core/util:framework_internal_public_hdrs", ], @@ -2351,6 +2354,7 @@ tf_cuda_library( "//tensorflow/core/framework:attr_value_util", "//tensorflow/core/framework:bfloat16", "//tensorflow/core/framework:common_shape_fns", + "//tensorflow/core/framework:kernel_shape_util", "//tensorflow/core/framework:node_def_util", "//tensorflow/core/framework:node_properties", "//tensorflow/core/framework:numeric_types", @@ -3213,6 +3217,49 @@ test_suite( ], ) +tf_cc_test( + name = "common_runtime_placer_test", + size = "small", + srcs = [ + "common_runtime/placer_test.cc", + ], + linkopts = select({ + "//tensorflow:macos": ["-headerpad_max_install_names"], + "//conditions:default": [], + }), + linkstatic = tf_kernel_tests_linkstatic(), + tags = ["no_windows"], + deps = [ + ":core", + ":core_cpu", + ":core_cpu_internal", + ":direct_session_internal", + ":framework", + ":framework_internal", + ":lib", + ":lib_internal", + ":ops", + ":protos_all_cc", + ":test", + ":test_main", + ":testlib", + "//tensorflow/cc:cc_ops", + 
"//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/cc:sendrecv_ops", + "//tensorflow/cc:while_loop", + "//tensorflow/core/kernels:ops_util", + "//tensorflow/core/platform:regexp", + "//tensorflow/core/util:protos_test_cc", + "//third_party/eigen3", + "@com_google_absl//absl/base", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + tf_cc_tests( name = "core_higher_level_tests", size = "small", @@ -3232,7 +3279,6 @@ tf_cc_tests( "common_runtime/optimization_registry_test.cc", "common_runtime/pending_counts_test.cc", "common_runtime/placer_inspection_required_ops_utils_test.cc", - "common_runtime/placer_test.cc", "common_runtime/session_test.cc", "common_runtime/threadpool_device_test.cc", "//tensorflow/core/example:feature_util_test.cc", diff --git a/tensorflow/core/api_def/base_api/api_def_BeginEpoch.pbtxt b/tensorflow/core/api_def/base_api/api_def_BeginEpoch.pbtxt new file mode 100644 index 00000000000..d5fd0d609c8 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_BeginEpoch.pbtxt @@ -0,0 +1,5 @@ +op { + graph_op_name: "BeginEpoch" + visibility: HIDDEN + summary: "Begins a tf.data service dataset epoch." +} diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt index 75278f3c806..68438bc8114 100644 --- a/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt @@ -21,6 +21,6 @@ uncompressed by running: convert $src.gif -coalesce $dst.gif This op also supports decoding JPEGs and PNGs, though it is cleaner to use -`tf.image.decode_image`. +`tf.io.decode_image`. END } diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt index b9521370d35..e6147a00412 100644 --- a/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt @@ -75,6 +75,6 @@ downscaling the image later. This op also supports decoding PNGs and non-animated GIFs since the interface is -the same, though it is cleaner to use `tf.image.decode_image`. +the same, though it is cleaner to use `tf.io.decode_image`. END } diff --git a/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt index 63404db8009..450de43751f 100644 --- a/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt @@ -34,6 +34,6 @@ If needed, the PNG-encoded image is transformed to match the requested number of color channels. This op also supports decoding JPEGs and non-animated GIFs since the interface -is the same, though it is cleaner to use `tf.image.decode_image`. +is the same, though it is cleaner to use `tf.io.decode_image`. END } diff --git a/tensorflow/core/api_def/base_api/api_def_DistributeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_DistributeDataset.pbtxt new file mode 100644 index 00000000000..a04f1e830c4 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_DistributeDataset.pbtxt @@ -0,0 +1,5 @@ +op { + graph_op_name: "DataServiceDataset" + visibility: HIDDEN + summary: "Creates a dataset that reads data from the tf.data service." 
+} diff --git a/tensorflow/core/api_def/base_api/api_def_ImageProjectiveTransformV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImageProjectiveTransformV2.pbtxt index 73d548b226d..a9d5b981576 100644 --- a/tensorflow/core/api_def/base_api/api_def_ImageProjectiveTransformV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ImageProjectiveTransformV2.pbtxt @@ -38,6 +38,12 @@ END name: "interpolation" description: <